List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
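setCombinerClass registers a Reducer that is run on map output before the shuffle, so it must be called while the job is still being defined; once the job has been submitted the call throws IllegalStateException. Before the Mahout examples below, here is a minimal self-contained sketch of the usual pattern. It assumes Hadoop's bundled TokenCounterMapper and IntSumReducer and placeholder "in"/"out" paths; it is illustrative only and not taken from any of the source files listed on this page.

// Minimal word-count driver shown only to illustrate setCombinerClass.
// The input/output paths ("in", "out") are placeholders for this sketch.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountWithCombiner {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count with combiner");
        job.setJarByClass(WordCountWithCombiner.class);
        job.setMapperClass(TokenCounterMapper.class);  // emits (word, 1)
        job.setCombinerClass(IntSumReducer.class);     // pre-sums counts on the map side, before the shuffle
        job.setReducerClass(IntSumReducer.class);      // final sum per word
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("in"));
        FileOutputFormat.setOutputPath(job, new Path("out"));
        // setCombinerClass must be called before submission; after submission
        // the Job is no longer in the DEFINE state and the call throws
        // IllegalStateException.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}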
From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    lambda = Double.parseDouble(getOption("lambda"));
    alpha = Double.parseDouble(getOption("alpha"));
    implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));
    numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
    usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));

    /*
     * compute the factorization A = U M'
     *
     * where A (users x items) is the matrix of known ratings
     *       U (users x features) is the representation of users in the feature space
     *       M (items x features) is the representation of items in the feature space
     */

    if (usesLongIDs) {
        Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
        mapUsers.waitForCompletion(true);

        Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
        mapItems.waitForCompletion(true);
    }

    /* create A' */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
            ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    itemRatings.setCombinerClass(VectorSumCombiner.class);
    itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
    boolean succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    /* create A */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
            AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageItemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());

    numItems = averageRatings.getNumNondefaultElements();
    numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();

    log.info("Found {} users and {} items", numUsers, numItems);

    /* create an initial M */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast M, read A row-wise, recompute U row-wise */
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                currentIteration, "U", numItems);
        /* broadcast U, read A' row-wise, recompute M row-wise */
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                currentIteration, "M", numUsers);
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelMRPJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("lambda_a", null, "regularization parameter", true);
    addOption("lambda_fg", null, "regularization parameter", true);
    addOption("lambda_fu", null, "regularization parameter", true);
    addOption("lambda_lg", null, "regularization parameter", true);
    addOption("lambda_lu", null, "regularization parameter", true);
    addOption("lambda_g", null, "regularization parameter", true);
    addOption("lambda_u", null, "regularization parameter", true);
    addOption("dimFeatureUser", null, "dimension of user feature", true);
    addOption("dimUserRelation", null, "dimension of geography feature", true);
    addOption("userRelationPath", null, "geography features file", true);
    addOption("featureUserPath", null, "user features file", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    this.lambda_a = Double.parseDouble(getOption("lambda_a"));
    this.lambda_fg = Double.parseDouble(getOption("lambda_fg"));
    this.lambda_fu = Double.parseDouble(getOption("lambda_fu"));
    this.lambda_lg = Double.parseDouble(getOption("lambda_lg"));
    this.lambda_lu = Double.parseDouble(getOption("lambda_lu"));
    this.lambda_g = Double.parseDouble(getOption("lambda_g"));
    this.lambda_u = Double.parseDouble(getOption("lambda_u"));
    this.dimUserRelation = Integer.parseInt(getOption("dimUserRelation"));
    this.dimFeatureUser = Integer.parseInt(getOption("dimFeatureUser"));
    this.userRelationPath = getOption("userRelationPath");
    this.featureUserPath = getOption("featureUserPath");

    /* create A */
    Job userRatings = prepareJob(getInputPath(), pathToUserRatings(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRatings.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create A' */
    Job itemRatings = prepareJob(pathToUserRatings(), pathToItemRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    itemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create U; this is a symmetric matrix, so no need to compute U' */
    Job userRelation = prepareJob(getUserRelationPath(), pathToUserRelation(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRelation.setCombinerClass(VectorSumReducer.class);
    succeeded = userRelation.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu */
    Job featureUser = prepareJob(getFeatureUserPath(), pathToFeatureUser(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureUser.setCombinerClass(VectorSumReducer.class);
    succeeded = featureUser.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu' */
    Job featureUserTranspose = prepareJob(pathToFeatureUser(), pathToFeatureUserTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureUserTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureUserTranspose.waitForCompletion(true);
    if (!succeeded) return -1;

    /* Get some average values for initialization. */
    Job averageGeoFeatureValue = prepareJob(pathToUserRelation(), getTempPath("averageGeoFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageGeoFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageGeoFeatureTransposeValue = prepareJob(pathToUserRelationTranspose(),
            getTempPath("averageGeoFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageGeoFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureValue = prepareJob(pathToFeatureUser(), getTempPath("averageUserFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageUserFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureTransposeValue = prepareJob(pathToFeatureUserTranspose(),
            getTempPath("averageUserFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageUserFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Vector averageUserValue = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValue"), getConf());
    Vector averageGeoValue = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValue"), getConf());
    Vector averageUserValueTranspose = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValueTranspose"),
            getConf());
    Vector averageGeoValueTranspose = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValueTranspose"),
            getConf());

    /* create an initial Lu and Lg */
    initialize(averageUserValue, pathToLu(-1));
    initialize(averageGeoValue, pathToLg(-1));
    initialize(averageUserValueTranspose, pathToU(-1));
    initialize(averageGeoValueTranspose, pathToG(-1));

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast Lu, read A' Fu' Fg', recompute Lg */
        log.info("Recompute Lg via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToLu(currentIteration - 1), pathToLgviaA(currentIteration),
                this.lambda_a, this.lambda_lg);
        /* broadcast G, read Fg row-wise, recompute Lg */
        log.info("Recompute Lg via Fg (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToG(currentIteration - 1), pathToUserRelation(), pathToLgviaF(currentIteration),
                this.lambda_fg, this.lambda_lg);
        /* merge Lg */
        log.info("Merge Lg together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLgviaA(currentIteration), pathToLgviaF(currentIteration), pathToLg(currentIteration));
        /* broadcast Lg, read A row-wise, recompute Lu */
        log.info("Recompute Lu via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToLg(currentIteration), pathToLuviaA(currentIteration),
                this.lambda_a, this.lambda_lu);
        /* broadcast U, read Fu row-wise, recompute Lu */
        log.info("Recompute Lu via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToU(currentIteration - 1), pathToFeatureUser(), pathToLuviaF(currentIteration),
                this.lambda_fu, this.lambda_lu);
        /* merge Lu */
        log.info("Merge Lu together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLuviaA(currentIteration), pathToLuviaF(currentIteration), pathToLu(currentIteration));
        /* broadcast Lg, read Fg, recompute G */
        log.info("Recompute G via Fg' (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLg(currentIteration), pathToUserRelationTranspose(), pathToG(currentIteration),
                this.lambda_fg, this.lambda_g);
        /* broadcast Lu, read Fu, recompute U */
        log.info("Recompute U via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLu(currentIteration), pathToFeatureUserTranspose(), pathToU(currentIteration),
                this.lambda_fu, this.lambda_u);
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelMRPJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("lambda_a", null, "regularization parameter", true);
    addOption("lambda_fg", null, "regularization parameter", true);
    addOption("lambda_fu", null, "regularization parameter", true);
    addOption("lambda_lg", null, "regularization parameter", true);
    addOption("lambda_lu", null, "regularization parameter", true);
    addOption("lambda_g", null, "regularization parameter", true);
    addOption("lambda_u", null, "regularization parameter", true);
    addOption("dimFeatureUser", null, "dimension of user feature", true);
    addOption("dimFeatureGeo", null, "dimension of geography feature", true);
    addOption("featureGeoPath", null, "geography features file", true);
    addOption("featureUserPath", null, "user features file", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    this.lambda_a = Double.parseDouble(getOption("lambda_a"));
    this.lambda_fg = Double.parseDouble(getOption("lambda_fg"));
    this.lambda_fu = Double.parseDouble(getOption("lambda_fu"));
    this.lambda_lg = Double.parseDouble(getOption("lambda_lg"));
    this.lambda_lu = Double.parseDouble(getOption("lambda_lu"));
    this.lambda_g = Double.parseDouble(getOption("lambda_g"));
    this.lambda_u = Double.parseDouble(getOption("lambda_u"));
    this.dimFeatureGeo = Integer.parseInt(getOption("dimFeatureGeo"));
    this.dimFeatureUser = Integer.parseInt(getOption("dimFeatureUser"));
    this.featureGeoPath = getOption("featureGeoPath");
    this.featureUserPath = getOption("featureUserPath");

    /* create A */
    Job userRatings = prepareJob(getInputPath(), pathToUserRatings(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRatings.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create A' */
    Job itemRatings = prepareJob(pathToUserRatings(), pathToItemRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    itemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fg */
    Job featureGeo = prepareJob(getFeatureGeoPath(), pathToFeatureGeo(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureGeo.setCombinerClass(VectorSumReducer.class);
    succeeded = featureGeo.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fg' */
    Job featureGeoTranspose = prepareJob(pathToFeatureGeo(), pathToFeatureGeoTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureGeoTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureGeoTranspose.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu */
    Job featureUser = prepareJob(getFeatureUserPath(), pathToFeatureUser(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureUser.setCombinerClass(VectorSumReducer.class);
    succeeded = featureUser.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu' */
    Job featureUserTranspose = prepareJob(pathToFeatureUser(), pathToFeatureUserTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureUserTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureUserTranspose.waitForCompletion(true);
    if (!succeeded) return -1;

    /* Get some average values for initialization. */
    Job averageGeoFeatureValue = prepareJob(pathToFeatureGeo(), getTempPath("averageGeoFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageGeoFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageGeoFeatureTransposeValue = prepareJob(pathToFeatureGeoTranspose(),
            getTempPath("averageGeoFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageGeoFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureValue = prepareJob(pathToFeatureUser(), getTempPath("averageUserFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageUserFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureTransposeValue = prepareJob(pathToFeatureUserTranspose(),
            getTempPath("averageUserFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageUserFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Vector averageUserValue = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValue"), getConf());
    Vector averageGeoValue = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValue"), getConf());
    Vector averageUserValueTranspose = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValueTranspose"),
            getConf());
    Vector averageGeoValueTranspose = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValueTranspose"),
            getConf());

    /* create an initial Lu and Lg */
    initialize(averageUserValue, pathToLu(-1));
    initialize(averageGeoValue, pathToLg(-1));
    initialize(averageUserValueTranspose, pathToU(-1));
    initialize(averageGeoValueTranspose, pathToG(-1));

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast Lu, read A' Fu' Fg', recompute Lg */
        log.info("Recompute Lg via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToLu(currentIteration - 1), pathToLgviaA(currentIteration),
                this.lambda_a, this.lambda_lg);
        /* broadcast G, read Fg row-wise, recompute Lg */
        log.info("Recompute Lg via Fg (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToG(currentIteration - 1), pathToFeatureGeo(), pathToLgviaF(currentIteration),
                this.lambda_fg, this.lambda_lg);
        /* merge Lg */
        log.info("Merge Lg together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLgviaA(currentIteration), pathToLgviaF(currentIteration), pathToLg(currentIteration));
        /* broadcast Lg, read A row-wise, recompute Lu */
        log.info("Recompute Lu via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToLg(currentIteration), pathToLuviaA(currentIteration),
                this.lambda_a, this.lambda_lu);
        /* broadcast U, read Fu row-wise, recompute Lu */
        log.info("Recompute Lu via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToU(currentIteration - 1), pathToFeatureUser(), pathToLuviaF(currentIteration),
                this.lambda_fu, this.lambda_lu);
        /* merge Lu */
        log.info("Merge Lu together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLuviaA(currentIteration), pathToLuviaF(currentIteration), pathToLu(currentIteration));
        /* broadcast Lg, read Fg, recompute G */
        log.info("Recompute G via Fg' (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLg(currentIteration), pathToFeatureGeoTranspose(), pathToG(currentIteration),
                this.lambda_fg, this.lambda_g);
        /* broadcast Lu, read Fu, recompute U */
        log.info("Recompute U via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLu(currentIteration), pathToFeatureUserTranspose(), pathToU(currentIteration),
                this.lambda_fu, this.lambda_u);
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " + "(default: "
            + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));

    //convert items to an internal index
    Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
            ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
            VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    boolean succeeded = itemIDIndex.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //convert user preferences into a vector per user
    Job toUserVectors = prepareJob(getInputPath(), getOutputPath(USER_VECTORS), TextInputFormat.class,
            ToItemPrefsMapper.class, VarLongWritable.class,
            booleanData ? VarLongWritable.class : EntityPrefWritable.class, ToUserVectorsReducer.class,
            VarLongWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
    succeeded = toUserVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //we need the number of users later
    int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS)
            .getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);
    succeeded = toItemVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hbase.preparation.PreparePreferenceMatrixJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " + "(default: "
            + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));

    String workingTable = getConf().get(RecommenderJob.PARAM_WORKING_TABLE);
    String cfRatings = getConf().get(RecommenderJob.PARAM_CF_RATINGS);

    //convert items to an internal index
    Configuration mapred_config = HBaseConfiguration.create();
    mapred_config.setBoolean("mapred.compress.map.output", true);
    mapred_config.set(RecommenderJob.PARAM_CF_RATINGS, cfRatings);

    Job itemIDIndex = Job.getInstance(mapred_config);
    itemIDIndex.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), itemIDIndex,
            ItemIDIndexMapper.class, ItemIDIndexReducer.class));
    itemIDIndex.setJarByClass(ItemIDIndexMapper.class); // class that contains mapper and reducer

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
            scan, // Scan instance to control CF and attribute selection
            ItemIDIndexMapper.class, // mapper class
            VarIntWritable.class, // mapper output key
            VarLongWritable.class, // mapper output value
            itemIDIndex);

    itemIDIndex.setReducerClass(ItemIDIndexReducer.class); // reducer class
    itemIDIndex.setOutputKeyClass(VarIntWritable.class);
    itemIDIndex.setOutputValueClass(VarLongWritable.class);
    itemIDIndex.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(itemIDIndex, getOutputPath(ITEMID_INDEX)); // adjust directories as required

    if (!itemIDIndex.waitForCompletion(true))
        return -1;

    //////////////////////////////////////////////////////////////////////////
    //convert user preferences into a vector per user
    mapred_config.setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    mapred_config.setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    mapred_config.set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));

    Job toUserVectors_hb = Job.getInstance(mapred_config);
    toUserVectors_hb.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), toUserVectors_hb,
            ToItemPrefsMapper.class, ToUserVectorsReducer.class));
    toUserVectors_hb.setJarByClass(ToItemPrefsMapper.class); // class that contains mapper and reducer

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
            scan, // Scan instance to control CF and attribute selection
            ToItemPrefsMapper.class, // mapper class
            VarLongWritable.class, // mapper output key
            booleanData ? VarLongWritable.class : EntityPrefWritable.class, // mapper output value
            toUserVectors_hb);

    toUserVectors_hb.setReducerClass(ToUserVectorsReducer.class); // reducer class
    toUserVectors_hb.setNumReduceTasks(1); // at least one, adjust as required
    toUserVectors_hb.setOutputKeyClass(VarLongWritable.class);
    toUserVectors_hb.setOutputValueClass(VectorWritable.class);
    toUserVectors_hb.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(toUserVectors_hb, getOutputPath(USER_VECTORS)); // adjust directories as required

    if (!toUserVectors_hb.waitForCompletion(true))
        return -1;

    //////////////////////////////////////////////////////////////////////////
    //we need the number of users later
    int numberOfUsers = (int) toUserVectors_hb.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS)
            .getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    if (!toItemVectors.waitForCompletion(true))
        return -1;

    return 0;
}
From source file:org.apache.mahout.classifier.naivebayes.trainer.NaiveBayesTrainer.java
License:Apache License
private static void runNaiveBayesByLabelSummer(Path input, Configuration conf, Path labelMapPath, Path output,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DistributedCache.setCacheFiles(new URI[] { labelMapPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("Train Naive Bayes: input-folder: " + input + ", label-map-file: " + labelMapPath.toString());
    job.setJarByClass(NaiveBayesTrainer.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(NaiveBayesInstanceMapper.class);
    job.setCombinerClass(NaiveBayesSumReducer.class);
    job.setReducerClass(NaiveBayesSumReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.delete(conf, output);
    job.waitForCompletion(true);
}
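This example, like several others on this page, passes the same Reducer class to both setCombinerClass and setReducerClass. That is safe only when the reduce function is associative and commutative and its input key/value types equal its output key/value types, because combiner output is fed back into the shuffle as if it were map output. A hypothetical reducer shape that satisfies this contract (the class name and types below are illustrative and not taken from the Mahout sources above):

// Hypothetical reducer usable as its own combiner: KEYIN/VALUEIN equal
// KEYOUT/VALUEOUT, and summation is associative and commutative.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get(); // partial sums from the combiner and raw map output combine the same way
        }
        result.set(sum);
        context.write(key, result);
    }
}

By contrast, ParallelALSFactorizationJob above pairs VectorSumReducer with a dedicated VectorSumCombiner; that is the same idea split across two classes.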
From source file:org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(LABELS, "l", "comma-separated list of labels to include in training", false);
    addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
    }

    Path labPath;
    String labPathStr = getOption(LABEL_INDEX);
    if (labPathStr != null) {
        labPath = new Path(labPathStr);
    } else {
        labPath = getTempPath(LABEL_INDEX);
    }

    long labelSize = createLabelIndex(labPath);
    float alphaI = Float.parseFloat(getOption(ALPHA_I));
    boolean trainComplementary = hasOption(TRAIN_COMPLEMENTARY);

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());

    // Add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
            SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
            VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // Sum up all the weights from the previous step, per label and per feature
    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // Put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

    if (trainComplementary) {
        // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
        // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
        Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
                SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
                VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
        thetaSummer.setCombinerClass(VectorSumReducer.class);
        thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
        thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
        succeeded = thetaSummer.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    // Put the per label theta normalizers into the cache
    HadoopUtil.cacheFiles(getTempPath(THETAS), getConf());

    // Validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(getOutputPath(), getConf());

    return 0;
}
From source file:org.apache.mahout.classifier.rbm.training.RBMClassifierTrainingJob.java
License:Apache License
/**
 * Finetune using map/reduce.
 *
 * @param batch the batch
 * @param iteration the iteration
 * @param learningrate the learningrate
 * @return true, if successful
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws InterruptedException the interrupted exception
 * @throws ClassNotFoundException the class not found exception
 */
private boolean fintuneMR(Path batch, int iteration, double learningrate)
        throws IOException, InterruptedException, ClassNotFoundException {
    //prepare and run finetune job
    long batchsize;
    HadoopUtil.delete(getConf(), getTempPath(WEIGHT_UPDATES));
    HadoopUtil.cacheFiles(getOutputPath(), getConf());

    Job trainDBM = prepareJob(batch, getTempPath(WEIGHT_UPDATES), SequenceFileInputFormat.class,
            DBMBackPropTrainingMapper.class, IntWritable.class, MatrixWritable.class,
            DBMBackPropTrainingReducer.class, IntWritable.class, MatrixWritable.class,
            SequenceFileOutputFormat.class);
    trainDBM.getConfiguration().set("labelcount", String.valueOf(labelcount));
    trainDBM.getConfiguration().set("learningrate", String.valueOf(learningrate));
    trainDBM.setCombinerClass(DBMBackPropTrainingReducer.class);

    if (!trainDBM.waitForCompletion(true))
        return false;

    batchsize = trainDBM.getCounters().findCounter(DBMBackPropTrainingMapper.BATCHES.SIZE).getValue();

    changeAndSaveModel(getOutputPath(), batchsize, (iteration == 0) ? 0 : momentum);

    return true;
}
From source file:org.apache.mahout.classifier.rbm.training.RBMClassifierTrainingJob.java
License:Apache License
/**
 * Train greedy mr.
 *
 * @param rbmNr the rbm nr
 * @param batch the batch
 * @param iteration the iteration
 * @param learningrate the learningrate
 * @return true, if successful
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws InterruptedException the interrupted exception
 * @throws ClassNotFoundException the class not found exception
 */
private boolean trainGreedyMR(int rbmNr, Path batch, int iteration, double learningrate)
        throws IOException, InterruptedException, ClassNotFoundException {
    //run greedy pretraining as map reduce job
    long batchsize;
    HadoopUtil.delete(getConf(), getTempPath(WEIGHT_UPDATES));
    HadoopUtil.cacheFiles(getOutputPath(), getConf());

    Job trainRBM = prepareJob(batch, getTempPath(WEIGHT_UPDATES), SequenceFileInputFormat.class,
            RBMGreedyPreTrainingMapper.class, IntWritable.class, MatrixWritable.class,
            RBMGreedyPreTrainingReducer.class, IntWritable.class, MatrixWritable.class,
            SequenceFileOutputFormat.class);
    trainRBM.getConfiguration().set("rbmNr", String.valueOf(rbmNr));
    trainRBM.getConfiguration().set("labelcount", String.valueOf(labelcount));
    trainRBM.getConfiguration().set("learningrate", String.valueOf(learningrate));
    trainRBM.getConfiguration().set("nrGibbsSampling", String.valueOf(nrGibbsSampling));
    trainRBM.setCombinerClass(RBMGreedyPreTrainingReducer.class);

    if (!trainRBM.waitForCompletion(true))
        return false;

    batchsize = trainRBM.getCounters().findCounter(RBMGreedyPreTrainingMapper.BATCH.SIZE).getValue();

    changeAndSaveModel(getOutputPath(), batchsize, (lastUpdate[rbmNr] == null) ? 0 : momentum);

    return true;
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchDriver.java
License:Apache License
/**
 * Run one iteration of the Baum-Welch Map Reduce algorithm using the supplied arguments
 *
 * @param conf the Configuration to use
 * @param input the Path to the directory containing input
 * @param modelIn the Path to the HmmModel
 * @param modelOut the Path to the output directory
 * @param hiddenStateToIdMap the Path to the map of hidden states to ids
 * @param emittedStateToIdMap the Path to the map of emitted states to ids
 * @param numHidden the number of Hidden states
 * @param numObserved the number of Observed states
 * @param scaling name of the scaling method
 * @param delta the convergence delta value
 * @return true or false depending on convergence check
 */
private static boolean runIteration(Configuration conf, Path input, Path modelIn, Path modelOut,
        Path hiddenStateToIdMap, Path emittedStateToIdMap, int numHidden, int numObserved, String scaling,
        String delta) throws IOException, InterruptedException, ClassNotFoundException {

    conf.set(BaumWelchConfigKeys.EMITTED_STATES_MAP_PATH, emittedStateToIdMap.toString());
    conf.set(BaumWelchConfigKeys.HIDDEN_STATES_MAP_PATH, hiddenStateToIdMap.toString());
    conf.set(BaumWelchConfigKeys.SCALING_OPTION_KEY, scaling);
    conf.set(BaumWelchConfigKeys.MODEL_PATH_KEY, modelIn.toString());
    conf.set(BaumWelchConfigKeys.NUMBER_OF_HIDDEN_STATES_KEY, ((Integer) numHidden).toString());
    conf.set(BaumWelchConfigKeys.NUMBER_OF_EMITTED_STATES_KEY, ((Integer) numObserved).toString());
    conf.set(BaumWelchConfigKeys.MODEL_CONVERGENCE_KEY, delta);

    Job job = new Job(conf, "Baum-Welch Driver running runIteration over modelIn: "
            + conf.get(BaumWelchConfigKeys.MODEL_PATH_KEY));
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MapWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(BaumWelchMapper.class);
    job.setCombinerClass(BaumWelchCombiner.class);
    job.setReducerClass(BaumWelchReducer.class);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, modelOut);
    job.setJarByClass(BaumWelchDriver.class);

    HadoopUtil.delete(conf, modelOut);

    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Baum-Welch Iteration failed processing " + modelIn);
    }

    return isConverged(modelIn, modelOut, numHidden, numObserved, conf);
}