Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setCombinerClass, drawn from open-source projects.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
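
Below is a minimal, self-contained sketch (not taken from the listings that follow) of a typical call site. TokenizerMapper and IntSumReducer are assumed placeholder classes in the style of Hadoop's WordCount example; the combiner must be a Reducer whose input and output key/value types match the mapper's output types.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountWithCombiner {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count with combiner");
        job.setJarByClass(WordCountWithCombiner.class);

        job.setMapperClass(TokenizerMapper.class); // placeholder mapper emitting <Text, IntWritable> pairs
        // the combiner pre-aggregates map output locally before the shuffle;
        // it must not change the key/value types it receives
        job.setCombinerClass(IntSumReducer.class); // placeholder reducer that sums IntWritable counts
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // setCombinerClass throws IllegalStateException if the job has already been submitted
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

As in the listings below, the same Reducer class is often reused as the combiner when its operation is associative and commutative (for example, summing count or vector values); otherwise a dedicated combiner class is supplied.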

Usage

From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
    addOption("usesLongIDs", null, "input contains long IDs that need to be translated");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    lambda = Double.parseDouble(getOption("lambda"));
    alpha = Double.parseDouble(getOption("alpha"));
    implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));

    numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
    usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));

    /*
     * compute the factorization A = U M'
     *
     * where A (users x items) is the matrix of known ratings
     *       U (users x features) is the representation of users in the feature space
     *       M (items x features) is the representation of items in the feature space
     */

    if (usesLongIDs) {
        Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
        mapUsers.waitForCompletion(true);

        Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
                MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
                VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
        mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
        mapItems.waitForCompletion(true);
    }

    /* create A' */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
            ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
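    // the combiner performs map-side partial sums of the rating vectors, shrinking the data shuffled to the reducers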
    itemRatings.setCombinerClass(VectorSumCombiner.class);
    itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
    boolean succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    /* create A */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = userRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
            AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageItemRatings.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());

    numItems = averageRatings.getNumNondefaultElements();
    numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();

    log.info("Found {} users and {} items", numUsers, numItems);

    /* create an initial M */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        /* broadcast M, read A row-wise, recompute U row-wise */
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                currentIteration, "U", numItems);
        /* broadcast U, read A' row-wise, recompute M row-wise */
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration), currentIteration,
                "M", numUsers);
    }

    return 0;
}

From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelMRPJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();

    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("lambda_a", null, "regularization parameter", true);
    addOption("lambda_fg", null, "regularization parameter", true);
    addOption("lambda_fu", null, "regularization parameter", true);
    addOption("lambda_lg", null, "regularization parameter", true);
    addOption("lambda_lu", null, "regularization parameter", true);
    addOption("lambda_g", null, "regularization parameter", true);
    addOption("lambda_u", null, "regularization parameter", true);
    addOption("dimFeatureUser", null, "dimension of user feautre", true);
    addOption("dimUserRelation", null, "dimension of geography feautre", true);
    addOption("userRelationPath", null, "geography features file", true);
    addOption("featureUserPath", null, "user features file", true);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    this.lambda_a = Double.parseDouble(getOption("lambda_a"));
    this.lambda_fg = Double.parseDouble(getOption("lambda_fg"));
    this.lambda_fu = Double.parseDouble(getOption("lambda_fu"));
    this.lambda_lg = Double.parseDouble(getOption("lambda_lg"));
    this.lambda_lu = Double.parseDouble(getOption("lambda_lu"));
    this.lambda_g = Double.parseDouble(getOption("lambda_g"));
    this.lambda_u = Double.parseDouble(getOption("lambda_u"));
    this.dimUserRelation = Integer.parseInt(getOption("dimUserRelation"));
    this.dimFeatureUser = Integer.parseInt(getOption("dimFeatureUser"));
    this.userRelationPath = getOption("userRelationPath");
    this.featureUserPath = getOption("featureUserPath");

    /* create A */
    Job userRatings = prepareJob(getInputPath(), pathToUserRatings(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
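    // the reducer class doubles as the combiner here: vector summation is associative and commutative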
    userRatings.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = userRatings.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create A' */
    Job itemRatings = prepareJob(pathToUserRatings(), pathToItemRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    itemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create U; this is a symmetric matrix, so there is no need to compute U' */
    Job userRelation = prepareJob(getUserRelationPath(), pathToUserRelation(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRelation.setCombinerClass(VectorSumReducer.class);
    succeeded = userRelation.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create Fu */
    Job featureUser = prepareJob(getFeatureUserPath(), pathToFeatureUser(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureUser.setCombinerClass(VectorSumReducer.class);
    succeeded = featureUser.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create Fu' */
    Job featureUserTranspose = prepareJob(pathToFeatureUser(), pathToFeatureUserTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureUserTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureUserTranspose.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* Get some average values for initialization. */
    Job averageGeoFeatureValue = prepareJob(pathToUserRelation(), getTempPath("averageGeoFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageGeoFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Job averageGeoFeatureTransposeValue = prepareJob(pathToUserRelationTranspose(),
            getTempPath("averageGeoFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageGeoFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Job averageUserFeatureValue = prepareJob(pathToFeatureUser(), getTempPath("averageUserFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageUserFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Job averageUserFeatureTransposeValue = prepareJob(pathToFeatureUserTranspose(),
            getTempPath("averageUserFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageUserFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Vector averageUserValue = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValue"), getConf());
    Vector averageGeoValue = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValue"), getConf());
    Vector averageUserValueTranspose = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValueTranspose"),
            getConf());
    Vector averageGeoValueTranspose = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValueTranspose"),
            getConf());

    /* create an initial Lu Lg */
    initialize(averageUserValue, pathToLu(-1));
    initialize(averageGeoValue, pathToLg(-1));
    initialize(averageUserValueTranspose, pathToU(-1));
    initialize(averageGeoValueTranspose, pathToG(-1));

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {

        /* broadcast Lu, read A' Fu' Fg', recompute Lg */
        log.info("Recompute Lg via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToLu(currentIteration - 1), pathToLgviaA(currentIteration),
                this.lambda_a, this.lambda_lg);
        /* broadcast G, read Fg row-wise, recompute Lg */
        log.info("Recompute Lg via Fg (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToG(currentIteration - 1), pathToUserRelation(), pathToLgviaF(currentIteration),
                this.lambda_fg, this.lambda_lg);
        /* merge Lg */
        log.info("Merge Lg together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLgviaA(currentIteration), pathToLgviaF(currentIteration), pathToLg(currentIteration));

        /* broadcast Lg, read A row-wise, recompute Lu */
        log.info("Recompute Lu via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToLg(currentIteration), pathToLuviaA(currentIteration),
                this.lambda_a, this.lambda_lu);
        /* broadcast U, read Fu row-wise, recompute Lu */
        log.info("Recompute Lu via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToU(currentIteration - 1), pathToFeatureUser(), pathToLuviaF(currentIteration),
                this.lambda_fu, this.lambda_lu);
        /* merge Lu */
        log.info("Merge Lu together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLuviaA(currentIteration), pathToLuviaF(currentIteration), pathToLu(currentIteration));

        /* broadcast Lg, read Fg, recompute G */
        log.info("Recompute G via Fg' (interation {}/{})", currentIteration, numIterations);
        runSolver(pathToLg(currentIteration), pathToUserRelationTranspose(), pathToG(currentIteration),
                this.lambda_fg, this.lambda_g);

        /* broadcast Lu, read Fu, recompute U */
        log.info("Recompute U via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLu(currentIteration), pathToFeatureUserTranspose(), pathToU(currentIteration),
                this.lambda_fu, this.lambda_u);
    }

    return 0;
}

From source file:org.apache.mahout.cf.taste.hadoop.als.ParallelMRPJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();

    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("lambda_a", null, "regularization parameter", true);
    addOption("lambda_fg", null, "regularization parameter", true);
    addOption("lambda_fu", null, "regularization parameter", true);
    addOption("lambda_lg", null, "regularization parameter", true);
    addOption("lambda_lu", null, "regularization parameter", true);
    addOption("lambda_g", null, "regularization parameter", true);
    addOption("lambda_u", null, "regularization parameter", true);
    addOption("dimFeatureUser", null, "dimension of user feautre", true);
    addOption("dimFeatureGeo", null, "dimension of geography feautre", true);
    addOption("featureGeoPath", null, "geography features file", true);
    addOption("featureUserPath", null, "user features file", true);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(getOption("numFeatures"));
    numIterations = Integer.parseInt(getOption("numIterations"));
    this.lambda_a = Double.parseDouble(getOption("lambda_a"));
    this.lambda_fg = Double.parseDouble(getOption("lambda_fg"));
    this.lambda_fu = Double.parseDouble(getOption("lambda_fu"));
    this.lambda_lg = Double.parseDouble(getOption("lambda_lg"));
    this.lambda_lu = Double.parseDouble(getOption("lambda_lu"));
    this.lambda_g = Double.parseDouble(getOption("lambda_g"));
    this.lambda_u = Double.parseDouble(getOption("lambda_u"));
    this.dimFeatureGeo = Integer.parseInt(getOption("dimFeatureGeo"));
    this.dimFeatureUser = Integer.parseInt(getOption("dimFeatureUser"));
    this.featureGeoPath = getOption("featureGeoPath");
    this.featureUserPath = getOption("featureUserPath");

    /* create A */
    Job userRatings = prepareJob(getInputPath(), pathToUserRatings(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    userRatings.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = userRatings.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create A' */
    Job itemRatings = prepareJob(pathToUserRatings(), pathToItemRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    itemRatings.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = itemRatings.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create Fg */
    Job featureGeo = prepareJob(getFeatureGeoPath(), pathToFeatureGeo(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureGeo.setCombinerClass(VectorSumReducer.class);
    succeeded = featureGeo.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create Fg' */
    Job featureGeoTranspose = prepareJob(pathToFeatureGeo(), pathToFeatureGeoTranspose(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    featureGeoTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureGeoTranspose.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create Fu */
    Job featureUser = prepareJob(getFeatureUserPath(), pathToFeatureUser(), TextInputFormat.class,
            InputVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    featureUser.setCombinerClass(VectorSumReducer.class);
    succeeded = featureUser.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* create Fu' */
    Job featureUserTranspose = prepareJob(pathToFeatureUser(), pathToFeatureUserTranspose(),
            TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    featureUserTranspose.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = featureUserTranspose.waitForCompletion(true);
    if (!succeeded)
        return -1;

    /* Get some average values for initialization. */
    Job averageGeoFeatureValue = prepareJob(pathToFeatureGeo(), getTempPath("averageGeoFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageGeoFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Job averageGeoFeatureTransposeValue = prepareJob(pathToFeatureGeoTranspose(),
            getTempPath("averageGeoFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageGeoFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageGeoFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Job averageUserFeatureValue = prepareJob(pathToFeatureUser(), getTempPath("averageUserFeatureValue"),
            AverageVectorsMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageUserFeatureValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Job averageUserFeatureTransposeValue = prepareJob(pathToFeatureUserTranspose(),
            getTempPath("averageUserFeatureValueTranspose"), AverageVectorsMapper.class, IntWritable.class,
            VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
    averageUserFeatureTransposeValue.setCombinerClass(MergeVectorsCombiner.class);
    succeeded = averageUserFeatureTransposeValue.waitForCompletion(true);
    if (!succeeded)
        return -1;

    Vector averageUserValue = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValue"), getConf());
    Vector averageGeoValue = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValue"), getConf());
    Vector averageUserValueTranspose = ALSUtils.readFirstRow(getTempPath("averageUserFeatureValueTranspose"),
            getConf());
    Vector averageGeoValueTranspose = ALSUtils.readFirstRow(getTempPath("averageGeoFeatureValueTranspose"),
            getConf());

    /* create an initial Lu Lg */
    initialize(averageUserValue, pathToLu(-1));
    initialize(averageGeoValue, pathToLg(-1));
    initialize(averageUserValueTranspose, pathToU(-1));
    initialize(averageGeoValueTranspose, pathToG(-1));

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {

        /* broadcast Lu, read A' Fu' Fg', recompute Lg */
        log.info("Recompute Lg via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToLu(currentIteration - 1), pathToLgviaA(currentIteration),
                this.lambda_a, this.lambda_lg);
        /* broadcast G, read Fg row-wise, recompute Lg */
        log.info("Recompute Lg via Fg (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToG(currentIteration - 1), pathToFeatureGeo(), pathToLgviaF(currentIteration),
                this.lambda_fg, this.lambda_lg);
        /* merge Lg */
        log.info("Merge Lg together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLgviaA(currentIteration), pathToLgviaF(currentIteration), pathToLg(currentIteration));

        /* broadcast Lg, read A row-wise, recompute Lu */
        log.info("Recompute Lu via A (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToLg(currentIteration), pathToLuviaA(currentIteration),
                this.lambda_a, this.lambda_lu);
        /* broadcast U, read Fu row-wise, recompute Lu */
        log.info("Recompute Lu via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToU(currentIteration - 1), pathToFeatureUser(), pathToLuviaF(currentIteration),
                this.lambda_fu, this.lambda_lu);
        /* merge Lu */
        log.info("Merge Lu together (iteration {}/{})", currentIteration, numIterations);
        mergeLuorLg(pathToLuviaA(currentIteration), pathToLuviaF(currentIteration), pathToLu(currentIteration));

        /* broadcast Lg, read Fg, recompute G */
        log.info("Recompute G via Fg' (interation {}/{})", currentIteration, numIterations);
        runSolver(pathToLg(currentIteration), pathToFeatureGeoTranspose(), pathToG(currentIteration),
                this.lambda_fg, this.lambda_g);

        /* broadcast Lu, read Fu, recompute U */
        log.info("Recompute U via Fu (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToLu(currentIteration), pathToFeatureUserTranspose(), pathToU(currentIteration),
                this.lambda_fu, this.lambda_u);
    }

    return 0;
}

From source file:org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));
    //convert items to an internal index
    Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
            ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
            VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    boolean succeeded = itemIDIndex.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    //convert user preferences into a vector per user
    Job toUserVectors = prepareJob(getInputPath(), getOutputPath(USER_VECTORS), TextInputFormat.class,
            ToItemPrefsMapper.class, VarLongWritable.class,
            booleanData ? VarLongWritable.class : EntityPrefWritable.class, ToUserVectorsReducer.class,
            VarLongWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
    succeeded = toUserVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    //we need the number of users later
    int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS)
            .getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());
    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    succeeded = toItemVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    return 0;
}

From source file:org.apache.mahout.cf.taste.hbase.preparation.PreparePreferenceMatrixJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));
    String workingTable = getConf().get(RecommenderJob.PARAM_WORKING_TABLE);
    String cfRatings = getConf().get(RecommenderJob.PARAM_CF_RATINGS);

    //convert items to an internal index
    Configuration mapred_config = HBaseConfiguration.create();
    mapred_config.setBoolean("mapred.compress.map.output", true);
    mapred_config.set(RecommenderJob.PARAM_CF_RATINGS, cfRatings);

    Job itemIDIndex = Job.getInstance(mapred_config);
    itemIDIndex.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), itemIDIndex,
            ItemIDIndexMapper.class, ItemIDIndexReducer.class));
    itemIDIndex.setJarByClass(ItemIDIndexMapper.class); // class that contains mapper and reducer

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
            scan, // Scan instance to control CF and attribute selection
            ItemIDIndexMapper.class, // mapper class
            VarIntWritable.class, // mapper output key
            VarLongWritable.class, // mapper output value
            itemIDIndex);

    itemIDIndex.setReducerClass(ItemIDIndexReducer.class); // reducer class

    itemIDIndex.setOutputKeyClass(VarIntWritable.class);
    itemIDIndex.setOutputValueClass(VarLongWritable.class);
    itemIDIndex.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setOutputPath(itemIDIndex, getOutputPath(ITEMID_INDEX)); // adjust directories as required

    if (!itemIDIndex.waitForCompletion(true))
        return -1;
    //////////////////////////////////////////////////////////////////////////

    //convert user preferences into a vector per user
    mapred_config.setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    mapred_config.setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    mapred_config.set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));

    Job toUserVectors_hb = Job.getInstance(mapred_config);
    toUserVectors_hb.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), toUserVectors_hb,
            ToItemPrefsMapper.class, ToUserVectorsReducer.class));
    toUserVectors_hb.setJarByClass(ToItemPrefsMapper.class); // class that contains mapper and reducer

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
            scan, // Scan instance to control CF and attribute selection
            ToItemPrefsMapper.class, // mapper class
            VarLongWritable.class, // mapper output key
            booleanData ? VarLongWritable.class : EntityPrefWritable.class, // mapper output value
            toUserVectors_hb);

    toUserVectors_hb.setReducerClass(ToUserVectorsReducer.class); // reducer class
    toUserVectors_hb.setNumReduceTasks(1); // at least one, adjust as required

    toUserVectors_hb.setOutputKeyClass(VarLongWritable.class);
    toUserVectors_hb.setOutputValueClass(VectorWritable.class);
    toUserVectors_hb.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setOutputPath(toUserVectors_hb, getOutputPath(USER_VECTORS)); // adjust directories as required

    if (!toUserVectors_hb.waitForCompletion(true))
        return -1;
    //////////////////////////////////////////////////////////////////////////

    //we need the number of users later
    int numberOfUsers = (int) toUserVectors_hb.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS)
            .getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());
    //build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
            ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    if (!toItemVectors.waitForCompletion(true))
        return -1;

    return 0;
}

From source file:org.apache.mahout.classifier.naivebayes.trainer.NaiveBayesTrainer.java

License:Apache License

private static void runNaiveBayesByLabelSummer(Path input, Configuration conf, Path labelMapPath, Path output,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    DistributedCache.setCacheFiles(new URI[] { labelMapPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName(
            "Train Naive Bayes: input-folder: " + input + ", label-map-file: " + labelMapPath.toString());
    job.setJarByClass(NaiveBayesTrainer.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(NaiveBayesInstanceMapper.class);
    job.setCombinerClass(NaiveBayesSumReducer.class);
    job.setReducerClass(NaiveBayesSumReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.delete(conf, output);
    job.waitForCompletion(true);
}

From source file:org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(LABELS, "l", "comma-separated list of labels to include in training", false);

    addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
            buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
    }
    Path labPath;
    String labPathStr = getOption(LABEL_INDEX);
    if (labPathStr != null) {
        labPath = new Path(labPathStr);
    } else {
        labPath = getTempPath(LABEL_INDEX);
    }
    long labelSize = createLabelIndex(labPath);
    float alphaI = Float.parseFloat(getOption(ALPHA_I));
    boolean trainComplementary = hasOption(TRAIN_COMPLEMENTARY);

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());

    // Add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
            SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
            VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    // Sum up all the weights from the previous step, per label and per feature
    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // Put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

    if (trainComplementary) {
        // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
        // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
        Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
                SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
                VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
        thetaSummer.setCombinerClass(VectorSumReducer.class);
        thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
        thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
        succeeded = thetaSummer.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    // Put the per label theta normalizers into the cache
    HadoopUtil.cacheFiles(getTempPath(THETAS), getConf());

    // Validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(getOutputPath(), getConf());

    return 0;
}

From source file:org.apache.mahout.classifier.rbm.training.RBMClassifierTrainingJob.java

License:Apache License

/**
 * Fine-tune using map/reduce.
 *
 * @param batch the batch
 * @param iteration the iteration
 * @param learningrate the learningrate
 * @return true, if successful
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws InterruptedException the interrupted exception
 * @throws ClassNotFoundException the class not found exception
 */
private boolean fintuneMR(Path batch, int iteration, double learningrate)
        throws IOException, InterruptedException, ClassNotFoundException {
    //prepare and run finetune job
    long batchsize;
    HadoopUtil.delete(getConf(), getTempPath(WEIGHT_UPDATES));
    HadoopUtil.cacheFiles(getOutputPath(), getConf());

    Job trainDBM = prepareJob(batch, getTempPath(WEIGHT_UPDATES), SequenceFileInputFormat.class,
            DBMBackPropTrainingMapper.class, IntWritable.class, MatrixWritable.class,
            DBMBackPropTrainingReducer.class, IntWritable.class, MatrixWritable.class,
            SequenceFileOutputFormat.class);
    trainDBM.getConfiguration().set("labelcount", String.valueOf(labelcount));
    trainDBM.getConfiguration().set("learningrate", String.valueOf(learningrate));

    trainDBM.setCombinerClass(DBMBackPropTrainingReducer.class);

    if (!trainDBM.waitForCompletion(true))
        return false;

    batchsize = trainDBM.getCounters().findCounter(DBMBackPropTrainingMapper.BATCHES.SIZE).getValue();

    changeAndSaveModel(getOutputPath(), batchsize, (iteration == 0) ? 0 : momentum);
    return true;
}

From source file:org.apache.mahout.classifier.rbm.training.RBMClassifierTrainingJob.java

License:Apache License

/**
 * Greedy pre-training via map/reduce.
 *
 * @param rbmNr the rbm nr
 * @param batch the batch
 * @param iteration the iteration
 * @param learningrate the learningrate
 * @return true, if successful
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws InterruptedException the interrupted exception
 * @throws ClassNotFoundException the class not found exception
 */
private boolean trainGreedyMR(int rbmNr, Path batch, int iteration, double learningrate)
        throws IOException, InterruptedException, ClassNotFoundException {
    //run greedy pretraining as map reduce job
    long batchsize;
    HadoopUtil.delete(getConf(), getTempPath(WEIGHT_UPDATES));
    HadoopUtil.cacheFiles(getOutputPath(), getConf());

    Job trainRBM = prepareJob(batch, getTempPath(WEIGHT_UPDATES), SequenceFileInputFormat.class,
            RBMGreedyPreTrainingMapper.class, IntWritable.class, MatrixWritable.class,
            RBMGreedyPreTrainingReducer.class, IntWritable.class, MatrixWritable.class,
            SequenceFileOutputFormat.class);
    trainRBM.getConfiguration().set("rbmNr", String.valueOf(rbmNr));
    trainRBM.getConfiguration().set("labelcount", String.valueOf(labelcount));
    trainRBM.getConfiguration().set("learningrate", String.valueOf(learningrate));
    trainRBM.getConfiguration().set("nrGibbsSampling", String.valueOf(nrGibbsSampling));

    trainRBM.setCombinerClass(RBMGreedyPreTrainingReducer.class);

    if (!trainRBM.waitForCompletion(true))
        return false;

    batchsize = trainRBM.getCounters().findCounter(RBMGreedyPreTrainingMapper.BATCH.SIZE).getValue();

    changeAndSaveModel(getOutputPath(), batchsize, (lastUpdate[rbmNr] == null) ? 0 : momentum);

    return true;
}

From source file:org.apache.mahout.classifier.sequencelearning.hmm.hadoop.BaumWelchDriver.java

License:Apache License

/**
 * Run one iteration of the Baum-Welch Map Reduce algorithm using the supplied arguments
 *
 * @param conf                the Configuration to use
 * @param input               the Path to the directory containing input
 * @param modelIn             the Path to the HmmModel
 * @param modelOut            the Path to the output directory
 * @param hiddenStateToIdMap  the Path to the map of hidden states to ids
 * @param emittedStateToIdMap the Path to the map of emitted states to ids
 * @param numHidden           the number of Hidden states
 * @param numObserved         the number of Observed states
 * @param scaling             name of the scaling method
 * @param delta               the convergence delta value
 * @return true or false depending on convergence check
 */

private static boolean runIteration(Configuration conf, Path input, Path modelIn, Path modelOut,
        Path hiddenStateToIdMap, Path emittedStateToIdMap, int numHidden, int numObserved, String scaling,
        String delta) throws IOException, InterruptedException, ClassNotFoundException {

    conf.set(BaumWelchConfigKeys.EMITTED_STATES_MAP_PATH, emittedStateToIdMap.toString());
    conf.set(BaumWelchConfigKeys.HIDDEN_STATES_MAP_PATH, hiddenStateToIdMap.toString());
    conf.set(BaumWelchConfigKeys.SCALING_OPTION_KEY, scaling);
    conf.set(BaumWelchConfigKeys.MODEL_PATH_KEY, modelIn.toString());
    conf.set(BaumWelchConfigKeys.NUMBER_OF_HIDDEN_STATES_KEY, ((Integer) numHidden).toString());
    conf.set(BaumWelchConfigKeys.NUMBER_OF_EMITTED_STATES_KEY, ((Integer) numObserved).toString());
    conf.set(BaumWelchConfigKeys.MODEL_CONVERGENCE_KEY, delta);

    Job job = new Job(conf, "Baum-Welch Driver running runIteration over modelIn: "
            + conf.get(BaumWelchConfigKeys.MODEL_PATH_KEY));
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MapWritable.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(BaumWelchMapper.class);
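    // a dedicated combiner class is used here instead of reusing the reducer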
    job.setCombinerClass(BaumWelchCombiner.class);
    job.setReducerClass(BaumWelchReducer.class);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, modelOut);

    job.setJarByClass(BaumWelchDriver.class);
    HadoopUtil.delete(conf, modelOut);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Baum-Welch Iteration failed processing " + modelIn);
    }

    return isConverged(modelIn, modelOut, numHidden, numObserved, conf);
}