List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
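setCombinerClass registers a Reducer that is run on map output before the shuffle, so it must be called while the job is still being defined; once the job has been submitted the call throws IllegalStateException. Before the Mahout examples below, here is a minimal self-contained sketch of the usual pattern. It assumes Hadoop's bundled TokenCounterMapper and IntSumReducer and placeholder "in"/"out" paths; it is illustrative only and not taken from any of the source files listed on this page.

// Minimal word-count driver shown only to illustrate setCombinerClass.
// The input/output paths ("in", "out") are placeholders for this sketch.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountWithCombiner {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count with combiner");
        job.setJarByClass(WordCountWithCombiner.class);
        job.setMapperClass(TokenCounterMapper.class);  // emits (word, 1)
        job.setCombinerClass(IntSumReducer.class);     // pre-sums counts on the map side, before the shuffle
        job.setReducerClass(IntSumReducer.class);      // final sum per word
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("in"));
        FileOutputFormat.setOutputPath(job, new Path("out"));
        // setCombinerClass must be called before submission; after submission
        // the Job is no longer in the DEFINE state and the call throws
        // IllegalStateException.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}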
From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    lambda = Double.parseDouble(getOption("lambda"));
    alpha = Double.parseDouble(getOption("alpha"));
    implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));
    numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
    usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));

    /*
     * compute the factorization A = U M'
     *
     * where A (users x items) is the matrix of known ratings
     *       U (users x features) is the representation of users in the feature space
     *       M (items x features) is the representation of items in the feature space
     */

    if (usesLongIDs) {
        Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
        mapUsers.waitForCompletion(true);

        Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
        mapItems.waitForCompletion(true);
    }

    /* create A' */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
            ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    itemRatings.setCombinerClass(VectorSumCombiner.class);
    itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
    boolean succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    /* create A */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
            AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageItemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());

    numItems = averageRatings.getNumNondefaultElements();
    numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();

    log.info("Found {} users and {} items", numUsers, numItems);

    /* create an initial M */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast M, read A row-wise, recompute U row-wise */
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                currentIteration, "U", numItems);
        /* broadcast U, read A' row-wise, recompute M row-wise */
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                currentIteration, "M", numUsers);
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelMRPJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("lambda_a", null, "regularization parameter", true);
    addOption("lambda_fg", null, "regularization parameter", true);
    addOption("lambda_fu", null, "regularization parameter", true);
    addOption("lambda_lg", null, "regularization parameter", true);
    addOption("lambda_lu", null, "regularization parameter", true);
    addOption("lambda_g", null, "regularization parameter", true);
    addOption("lambda_u", null, "regularization parameter", true);
    addOption("dimFeatureUser", null, "dimension of user feature", true);
    addOption("dimUserRelation", null, "dimension of geography feature", true);
    addOption("userRelationPath", null, "geography features file", true);
    addOption("featureUserPath", null, "user features file", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    this.lambda_a = Double.parseDouble(getOption("lambda_a"));
    this.lambda_fg = Double.parseDouble(getOption("lambda_fg"));
    this.lambda_fu = Double.parseDouble(getOption("lambda_fu"));
    this.lambda_lg = Double.parseDouble(getOption("lambda_lg"));
    this.lambda_lu = Double.parseDouble(getOption("lambda_lu"));
    this.lambda_g = Double.parseDouble(getOption("lambda_g"));
    this.lambda_u = Double.parseDouble(getOption("lambda_u"));
    this.dimUserRelation = Integer.parseInt(getOption("dimUserRelation"));
    this.dimFeatureUser = Integer.parseInt(getOption("dimFeatureUser"));
    this.userRelationPath = getOption("userRelationPath");
    this.featureUserPath = getOption("featureUserPath");

    /* create A */
    Job userRatings = prepareJob(getInputPath(), pathToUserRatings(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRatings.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create A' */
    Job itemRatings = prepareJob(pathToUserRatings(), pathToItemRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    itemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create U; this is a symmetric matrix, so no need to compute U' */
    Job userRelation = prepareJob(getUserRelationPath(), pathToUserRelation(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRelation.setCombinerClass(VectorSumReducer.class);
    succeeded = userRelation.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu */
    Job featureUser = prepareJob(getFeatureUserPath(), pathToFeatureUser(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureUser.setCombinerClass(VectorSumReducer.class);
    succeeded = featureUser.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu' */
    Job featureUserTranspose = prepareJob(pathToFeatureUser(), pathToFeatureUserTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureUserTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureUserTranspose.waitForCompletion(true);
    if (!succeeded) return -1;

    /* Get some average values for initialization. */
    Job averageGeoFeatureValue = prepareJob(pathToUserRelation(), getTempPath("averageGeoFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageGeoFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageGeoFeatureTransposeValue = prepareJob(pathToUserRelationTranspose(),
            getTempPath("averageGeoFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageGeoFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureValue = prepareJob(pathToFeatureUser(), getTempPath("averageUserFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageUserFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureTransposeValue = prepareJob(pathToFeatureUserTranspose(),
            getTempPath("averageUserFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageUserFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Vector averageUserValue = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValue"), getConf());
    Vector averageGeoValue = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValue"), getConf());
    Vector averageUserValueTranspose = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValueTranspose"),
            getConf());
    Vector averageGeoValueTranspose = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValueTranspose"),
            getConf());

    /* create an initial Lu and Lg */
    initialize(averageUserValue, pathToLu(-1));
    initialize(averageGeoValue, pathToLg(-1));
    initialize(averageUserValueTranspose, pathToU(-1));
    initialize(averageGeoValueTranspose, pathToG(-1));

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast Lu, read A' Fu' Fg', recompute Lg */
        log.info("Recompute Lg via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToLu(currentIteration - 1), pathToLgviaA(currentIteration),
                this.lambda_a, this.lambda_lg);
        /* broadcast G, read Fg row-wise, recompute Lg */
        log.info("Recompute Lg via Fg (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToG(currentIteration - 1), pathToUserRelation(), pathToLgviaF(currentIteration),
                this.lambda_fg, this.lambda_lg);
        /* merge Lg */
        log.info("Merge Lg together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLgviaA(currentIteration), pathToLgviaF(currentIteration), pathToLg(currentIteration));
        /* broadcast Lg, read A row-wise, recompute Lu */
        log.info("Recompute Lu via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToLg(currentIteration), pathToLuviaA(currentIteration),
                this.lambda_a, this.lambda_lu);
        /* broadcast U, read Fu row-wise, recompute Lu */
        log.info("Recompute Lu via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToU(currentIteration - 1), pathToFeatureUser(), pathToLuviaF(currentIteration),
                this.lambda_fu, this.lambda_lu);
        /* merge Lu */
        log.info("Merge Lu together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLuviaA(currentIteration), pathToLuviaF(currentIteration), pathToLu(currentIteration));
        /* broadcast Lg, read Fg, recompute G */
        log.info("Recompute G via Fg' (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLg(currentIteration), pathToUserRelationTranspose(), pathToG(currentIteration),
                this.lambda_fg, this.lambda_g);
        /* broadcast Lu, read Fu, recompute U */
        log.info("Recompute U via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLu(currentIteration), pathToFeatureUserTranspose(), pathToU(currentIteration),
                this.lambda_fu, this.lambda_u);
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelMRPJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("lambda_a", null, "regularization parameter", true);
    addOption("lambda_fg", null, "regularization parameter", true);
    addOption("lambda_fu", null, "regularization parameter", true);
    addOption("lambda_lg", null, "regularization parameter", true);
    addOption("lambda_lu", null, "regularization parameter", true);
    addOption("lambda_g", null, "regularization parameter", true);
    addOption("lambda_u", null, "regularization parameter", true);
    addOption("dimFeatureUser", null, "dimension of user feature", true);
    addOption("dimFeatureGeo", null, "dimension of geography feature", true);
    addOption("featureGeoPath", null, "geography features file", true);
    addOption("featureUserPath", null, "user features file", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    this.lambda_a = Double.parseDouble(getOption("lambda_a"));
    this.lambda_fg = Double.parseDouble(getOption("lambda_fg"));
    this.lambda_fu = Double.parseDouble(getOption("lambda_fu"));
    this.lambda_lg = Double.parseDouble(getOption("lambda_lg"));
    this.lambda_lu = Double.parseDouble(getOption("lambda_lu"));
    this.lambda_g = Double.parseDouble(getOption("lambda_g"));
    this.lambda_u = Double.parseDouble(getOption("lambda_u"));
    this.dimFeatureGeo = Integer.parseInt(getOption("dimFeatureGeo"));
    this.dimFeatureUser = Integer.parseInt(getOption("dimFeatureUser"));
    this.featureGeoPath = getOption("featureGeoPath");
    this.featureUserPath = getOption("featureUserPath");

    /* create A */
    Job userRatings = prepareJob(getInputPath(), pathToUserRatings(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRatings.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create A' */
    Job itemRatings = prepareJob(pathToUserRatings(), pathToItemRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    itemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fg */
    Job featureGeo = prepareJob(getFeatureGeoPath(), pathToFeatureGeo(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureGeo.setCombinerClass(VectorSumReducer.class);
    succeeded = featureGeo.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fg' */
    Job featureGeoTranspose = prepareJob(pathToFeatureGeo(), pathToFeatureGeoTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureGeoTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureGeoTranspose.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu */
    Job featureUser = prepareJob(getFeatureUserPath(), pathToFeatureUser(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureUser.setCombinerClass(VectorSumReducer.class);
    succeeded = featureUser.waitForCompletion(true);
    if (!succeeded) return -1;

    /* create Fu' */
    Job featureUserTranspose = prepareJob(pathToFeatureUser(), pathToFeatureUserTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureUserTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureUserTranspose.waitForCompletion(true);
    if (!succeeded) return -1;

    /* Get some average values for initialization. */
    Job averageGeoFeatureValue = prepareJob(pathToFeatureGeo(), getTempPath("averageGeoFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageGeoFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageGeoFeatureTransposeValue = prepareJob(pathToFeatureGeoTranspose(),
            getTempPath("averageGeoFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageGeoFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureValue = prepareJob(pathToFeatureUser(), getTempPath("averageUserFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageUserFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Job averageUserFeatureTransposeValue = prepareJob(pathToFeatureUserTranspose(),
            getTempPath("averageUserFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageUserFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded) return -1;

    Vector averageUserValue = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValue"), getConf());
    Vector averageGeoValue = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValue"), getConf());
    Vector averageUserValueTranspose = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValueTranspose"),
            getConf());
    Vector averageGeoValueTranspose = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValueTranspose"),
            getConf());

    /* create an initial Lu and Lg */
    initialize(averageUserValue, pathToLu(-1));
    initialize(averageGeoValue, pathToLg(-1));
    initialize(averageUserValueTranspose, pathToU(-1));
    initialize(averageGeoValueTranspose, pathToG(-1));

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast Lu, read A' Fu' Fg', recompute Lg */
        log.info("Recompute Lg via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToLu(currentIteration - 1), pathToLgviaA(currentIteration),
                this.lambda_a, this.lambda_lg);
        /* broadcast G, read Fg row-wise, recompute Lg */
        log.info("Recompute Lg via Fg (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToG(currentIteration - 1), pathToFeatureGeo(), pathToLgviaF(currentIteration),
                this.lambda_fg, this.lambda_lg);
        /* merge Lg */
        log.info("Merge Lg together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLgviaA(currentIteration), pathToLgviaF(currentIteration), pathToLg(currentIteration));
        /* broadcast Lg, read A row-wise, recompute Lu */
        log.info("Recompute Lu via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToLg(currentIteration), pathToLuviaA(currentIteration),
                this.lambda_a, this.lambda_lu);
        /* broadcast U, read Fu row-wise, recompute Lu */
        log.info("Recompute Lu via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToU(currentIteration - 1), pathToFeatureUser(), pathToLuviaF(currentIteration),
                this.lambda_fu, this.lambda_lu);
        /* merge Lu */
        log.info("Merge Lu together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLuviaA(currentIteration), pathToLuviaF(currentIteration), pathToLu(currentIteration));
        /* broadcast Lg, read Fg, recompute G */
        log.info("Recompute G via Fg' (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLg(currentIteration), pathToFeatureGeoTranspose(), pathToG(currentIteration),
                this.lambda_fg, this.lambda_g);
        /* broadcast Lu, read Fu, recompute U */
        log.info("Recompute U via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLu(currentIteration), pathToFeatureUserTranspose(), pathToU(currentIteration),
                this.lambda_fu, this.lambda_u);
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " + "(default: "
            + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));

    //convert items to an internal index
    Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
            ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
            VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    boolean succeeded = itemIDIndex.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //convert user preferences into a vector per user
    Job toUserVectors = prepareJob(getInputPath(), getOutputPath(USER_VECTORS), TextInputFormat.class,
            ToItemPrefsMapper.class, VarLongWritable.class,
            booleanData ? VarLongWritable.class : EntityPrefWritable.class, ToUserVectorsReducer.class,
            VarLongWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
    succeeded = toUserVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //we need the number of users later
    int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS)
            .getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);
    succeeded = toItemVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    return 0;
}
From source file:org.apache.mahout.cf.taste.hbase.preparation.PreparePreferenceMatrixJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " + "(default: "
            + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));

    String workingTable = getConf().get(RecommenderJob.PARAM_WORKING_TABLE);
    String cfRatings = getConf().get(RecommenderJob.PARAM_CF_RATINGS);

    //convert items to an internal index
    Configuration mapred_config = HBaseConfiguration.create();
    mapred_config.setBoolean("mapred.compress.map.output", true);
    mapred_config.set(RecommenderJob.PARAM_CF_RATINGS, cfRatings);

    Job itemIDIndex = Job.getInstance(mapred_config);
    itemIDIndex.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), itemIDIndex,
            ItemIDIndexMapper.class, ItemIDIndexReducer.class));
    itemIDIndex.setJarByClass(ItemIDIndexMapper.class); // class that contains mapper and reducer

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
            scan, // Scan instance to control CF and attribute selection
            ItemIDIndexMapper.class, // mapper class
            VarIntWritable.class, // mapper output key
            VarLongWritable.class, // mapper output value
            itemIDIndex);

    itemIDIndex.setReducerClass(ItemIDIndexReducer.class); // reducer class
    itemIDIndex.setOutputKeyClass(VarIntWritable.class);
    itemIDIndex.setOutputValueClass(VarLongWritable.class);
    itemIDIndex.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(itemIDIndex, getOutputPath(ITEMID_INDEX)); // adjust directories as required

    if (!itemIDIndex.waitForCompletion(true))
        return -1;

    //////////////////////////////////////////////////////////////////////////
    //convert user preferences into a vector per user
    mapred_config.setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    mapred_config.setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    mapred_config.set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));

    Job toUserVectors_hb = Job.getInstance(mapred_config);
    toUserVectors_hb.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), toUserVectors_hb,
            ToItemPrefsMapper.class, ToUserVectorsReducer.class));
    toUserVectors_hb.setJarByClass(ToItemPrefsMapper.class); // class that contains mapper and reducer

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
            scan, // Scan instance to control CF and attribute selection
            ToItemPrefsMapper.class, // mapper class
            VarLongWritable.class, // mapper output key
            booleanData ? VarLongWritable.class : EntityPrefWritable.class, // mapper output value
            toUserVectors_hb);

    toUserVectors_hb.setReducerClass(ToUserVectorsReducer.class); // reducer class
    toUserVectors_hb.setNumReduceTasks(1); // at least one, adjust as required
    toUserVectors_hb.setOutputKeyClass(VarLongWritable.class);
    toUserVectors_hb.setOutputValueClass(VectorWritable.class);
    toUserVectors_hb.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(toUserVectors_hb, getOutputPath(USER_VECTORS)); // adjust directories as required

    if (!toUserVectors_hb.waitForCompletion(true))
        return -1;

    //////////////////////////////////////////////////////////////////////////
    //we need the number of users later
    int numberOfUsers = (int) toUserVectors_hb.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS)
            .getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    if (!toItemVectors.waitForCompletion(true))
        return -1;

    return 0;
}
From source file:org.apache.mahout.classifier.naivebayes.trainer.NaiveBayesTrainer.java
License:Apache License
private static void runNaiveBayesByLabelSummer(Path input, Configuration conf, Path labelMapPath, Path output,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DistributedCache.setCacheFiles(new URI[] { labelMapPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("Train Naive Bayes: input-folder: " + input + ", label-map-file: " + labelMapPath.toString());
    job.setJarByClass(NaiveBayesTrainer.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(NaiveBayesInstanceMapper.class);
    job.setCombinerClass(NaiveBayesSumReducer.class);
    job.setReducerClass(NaiveBayesSumReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.delete(conf, output);
    job.waitForCompletion(true);
}
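This example, like several others on this page, passes the same Reducer class to both setCombinerClass and setReducerClass. That is safe only when the reduce function is associative and commutative and its input key/value types equal its output key/value types, because combiner output is fed back into the shuffle as if it were map output. A hypothetical reducer shape that satisfies this contract (the class name and types below are illustrative and not taken from the Mahout sources above):

// Hypothetical reducer usable as its own combiner: KEYIN/VALUEIN equal
// KEYOUT/VALUEOUT, and summation is associative and commutative.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get(); // partial sums from the combiner and raw map output combine the same way
        }
        result.set(sum);
        context.write(key, result);
    }
}

By contrast, ParallelALSFactorizationJob above pairs VectorSumReducer with a dedicated VectorSumCombiner; that is the same idea split across two classes.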
From source file:org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(LABELS, "l", "comma-separated list of labels to include in training", false);
    addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
    }

    Path labPath;
    String labPathStr = getOption(LABEL_INDEX);
    if (labPathStr != null) {
        labPath = new Path(labPathStr);
    } else {
        labPath = getTempPath(LABEL_INDEX);
    }

    long labelSize = createLabelIndex(labPath);
    float alphaI = Float.parseFloat(getOption(ALPHA_I));
    boolean trainComplementary = hasOption(TRAIN_COMPLEMENTARY);

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());

    // Add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
            SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
            VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // Sum up all the weights from the previous step, per label and per feature
    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // Put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

    if (trainComplementary) {
        // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
        // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
        Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
                SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
                VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
        thetaSummer.setCombinerClass(VectorSumReducer.class);
        thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
        thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
        succeeded = thetaSummer.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    // Put the per label theta normalizers into the cache
    HadoopUtil.cacheFiles(getTempPath(THETAS), getConf());

    // Validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(getOutputPath(), getConf());

    return 0;
}
From source file:org.apache.mahout.classifier.rbm.training.RBMClassifierTrainingJob.java
License:Apache License
/**
 * Finetune using map/reduce.
 *
 * @param batch the batch
 * @param iteration the iteration
 * @param learningrate the learningrate
 * @return true, if successful
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws InterruptedException the interrupted exception
 * @throws ClassNotFoundException the class not found exception
 */
private boolean fintuneMR(Path batch, int iteration, double learningrate)
        throws IOException, InterruptedException, ClassNotFoundException {
    //prepare and run finetune job
    long batchsize;
    HadoopUtil.delete(getConf(), getTempPath(WEIGHT_UPDATES));
    HadoopUtil.cacheFiles(getOutputPath(), getConf());

    Job trainDBM = prepareJob(batch, getTempPath(WEIGHT_UPDATES), SequenceFileInputFormat.class,
            DBMBackPropTrainingMapper.class, IntWritable.class, MatrixWritable.class,
            DBMBackPropTrainingReducer.class, IntWritable.class, MatrixWritable.class,
            SequenceFileOutputFormat.class);
    trainDBM.getConfiguration().set("labelcount", String.valueOf(labelcount));
    trainDBM.getConfiguration().set("learningrate", String.valueOf(learningrate));
    trainDBM.setCombinerClass(DBMBackPropTrainingReducer.class);

    if (!trainDBM.waitForCompletion(true))
        return false;

    batchsize = trainDBM.getCounters().findCounter(DBMBackPropTrainingMapper.BATCHES.SIZE).getValue();

    changeAndSaveModel(getOutputPath(), batchsize, (iteration == 0) ? 0 : momentum);

    return true;
}
From source file:org.apache.mahout.classifier.rbm.training.RBMClassifierTrainingJob.java
License:Apache License
/**
 * Train greedy mr.
 *
 * @param rbmNr the rbm nr
 * @param batch the batch
 * @param iteration the iteration
 * @param learningrate the learningrate
 * @return true, if successful
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws InterruptedException the interrupted exception
 * @throws ClassNotFoundException the class not found exception
 */
private boolean trainGreedyMR(int rbmNr, Path batch, int iteration, double learningrate)
        throws IOException, InterruptedException, ClassNotFoundException {
    //run greedy pretraining as map reduce job
    long batchsize;
    HadoopUtil.delete(getConf(), getTempPath(WEIGHT_UPDATES));
    HadoopUtil.cacheFiles(getOutputPath(), getConf());

    Job trainRBM = prepareJob(batch, getTempPath(WEIGHT_UPDATES), SequenceFileInputFormat.class,
            RBMGreedyPreTrainingMapper.class, IntWritable.class, MatrixWritable.class,
            RBMGreedyPreTrainingReducer.class, IntWritable.class, MatrixWritable.class,
            SequenceFileOutputFormat.class);
    trainRBM.getConfiguration().set("rbmNr", String.valueOf(rbmNr));
    trainRBM.getConfiguration().set("labelcount", String.valueOf(labelcount));
    trainRBM.getConfiguration().set("learningrate", String.valueOf(learningrate));
    trainRBM.getConfiguration().set("nrGibbsSampling", String.valueOf(nrGibbsSampling));
    trainRBM.setCombinerClass(RBMGreedyPreTrainingReducer.class);

    if (!trainRBM.waitForCompletion(true))
        return false;

    batchsize = trainRBM.getCounters().findCounter(RBMGreedyPreTrainingMapper.BATCH.SIZE).getValue();

    changeAndSaveModel(getOutputPath(), batchsize, (lastUpdate[rbmNr] == null) ? 0 : momentum);

    return true;
}
From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchDriver.java
License:Apache License
/**
 * Run one iteration of the Baum-Welch Map Reduce algorithm using the supplied arguments
 *
 * @param conf the Configuration to use
 * @param input the Path to the directory containing input
 * @param modelIn the Path to the HmmModel
 * @param modelOut the Path to the output directory
 * @param hiddenStateToIdMap the Path to the map of hidden states to ids
 * @param emittedStateToIdMap the Path to the map of emitted states to ids
 * @param numHidden the number of Hidden states
 * @param numObserved the number of Observed states
 * @param scaling name of the scaling method
 * @param delta the convergence delta value
 * @return true or false depending on convergence check
 */
private static boolean runIteration(Configuration conf, Path input, Path modelIn, Path modelOut,
        Path hiddenStateToIdMap, Path emittedStateToIdMap, int numHidden, int numObserved, String scaling,
        String delta) throws IOException, InterruptedException, ClassNotFoundException {

    conf.set(BaumWelchConfigKeys.EMITTED_STATES_MAP_PATH, emittedStateToIdMap.toString());
    conf.set(BaumWelchConfigKeys.HIDDEN_STATES_MAP_PATH, hiddenStateToIdMap.toString());
    conf.set(BaumWelchConfigKeys.SCALING_OPTION_KEY, scaling);
    conf.set(BaumWelchConfigKeys.MODEL_PATH_KEY, modelIn.toString());
    conf.set(BaumWelchConfigKeys.NUMBER_OF_HIDDEN_STATES_KEY, ((Integer) numHidden).toString());
    conf.set(BaumWelchConfigKeys.NUMBER_OF_EMITTED_STATES_KEY, ((Integer) numObserved).toString());
    conf.set(BaumWelchConfigKeys.MODEL_CONVERGENCE_KEY, delta);

    Job job = new Job(conf, "Baum-Welch Driver running runIteration over modelIn: "
            + conf.get(BaumWelchConfigKeys.MODEL_PATH_KEY));
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MapWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(BaumWelchMapper.class);
    job.setCombinerClass(BaumWelchCombiner.class);
    job.setReducerClass(BaumWelchReducer.class);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, modelOut);
    job.setJarByClass(BaumWelchDriver.class);

    HadoopUtil.delete(conf, modelOut);

    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Baum-Welch Iteration failed processing " + modelIn);
    }

    return isConverged(modelIn, modelOut, numHidden, numObserved, conf);
}