List of usage examples for the BOOLEAN_DATA field of org.apache.mahout.cf.taste.hadoop.item.RecommenderJob.
Declaration: String BOOLEAN_DATA
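The constant holds the configuration key that tells the item-based recommender pipeline to treat its input as associations without explicit preference values. Drivers write the flag into the job configuration; mappers and reducers read it back in setup(). A minimal sketch of that round trip, assuming Mahout's RecommenderJob is on the classpath (the class and variable names here are illustrative, not from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;

public class BooleanDataRoundTrip {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Driver side: mark the input as preference-less (boolean) data.
        conf.setBoolean(RecommenderJob.BOOLEAN_DATA, true);
        // Mapper/reducer side: read the flag back, defaulting to false.
        boolean booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
        System.out.println("booleanData = " + booleanData);
    }
}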
From source file: com.pocketx.gravity.recommender.cf.similarity.job.PreparePreferenceMatrixJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, "
        + "users with more preferences will be sampled down");
    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));

    // convert items to an internal index
    Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
        ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class,
        ItemIDIndexReducer.class, VarIntWritable.class, VarLongWritable.class,
        SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    boolean succeeded = itemIDIndex.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // convert user preferences into a vector per user
    Job toUserVectors = prepareJob(getInputPath(), getOutputPath(USER_VECTORS), TextInputFormat.class,
        ToItemPrefsMapper.class, VarLongWritable.class,
        booleanData ? VarLongWritable.class : EntityPrefWritable.class,
        ToUserVectorsReducer.class, VarLongWritable.class, VectorWritable.class,
        SequenceFileOutputFormat.class);
    toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
    succeeded = toUserVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // we need the number of users later
    int numberOfUsers = (int) toUserVectors.getCounters()
        .findCounter(ToUserVectorsReducer.Counters.USERS).getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

    // build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
        ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class,
        ToItemVectorsReducer.class, IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    /* configure sampling regarding the user vectors */
    if (hasOption("maxPrefsPerUser")) {
        int samplingSize = Integer.parseInt(getOption("maxPrefsPerUser"));
        toItemVectors.getConfiguration().setInt(ToItemVectorsMapper.SAMPLE_SIZE, samplingSize);
    }

    succeeded = toItemVectors.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    return 0;
}
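This driver follows Mahout's AbstractJob conventions (addInputOption(), parseArguments(), prepareJob()), so it should be launchable through ToolRunner, assuming the class does extend AbstractJob, which implements Tool. A hypothetical invocation; the paths and option values below are placeholders, not taken from the source:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import com.pocketx.gravity.recommender.cf.similarity.job.PreparePreferenceMatrixJob;

public class RunPreparePreferenceMatrix {
    public static void main(String[] args) throws Exception {
        // Hypothetical arguments; "--booleanData true" selects the preference-less code path.
        int exitCode = ToolRunner.run(new Configuration(), new PreparePreferenceMatrixJob(),
            new String[] { "--input", "/data/prefs.csv",
                           "--output", "/out/prefMatrix",
                           "--booleanData", "true",
                           "--tempDir", "/tmp/prefMatrix" });
        System.exit(exitCode);
    }
}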
From source file: nl.gridline.zieook.inx.movielens.AggregateAndRecommendReducer.java
License: Apache License
@Override
protected void setup(Context context) throws IOException {
    Configuration jobConf = context.getConfiguration();
    recommendationsPerUser = jobConf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS);
    booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    indexItemIDMap = TasteHadoopUtils.readItemIDIndexMap(jobConf.get(ITEMID_INDEX_PATH), jobConf);

    FSDataInputStream in = null;
    try {
        String itemFilePathString = jobConf.get(ITEMS_FILE);
        if (itemFilePathString == null) {
            itemsToRecommendFor = null;
        } else {
            Path unqualifiedItemsFilePath = new Path(itemFilePathString);
            FileSystem fs = FileSystem.get(unqualifiedItemsFilePath.toUri(), jobConf);
            itemsToRecommendFor = new FastIDSet();
            Path itemsFilePath = unqualifiedItemsFilePath.makeQualified(fs);
            in = fs.open(itemsFilePath);
            for (String line : new FileLineIterable(in)) {
                itemsToRecommendFor.add(Long.parseLong(line));
            }
        }
    } finally {
        IOUtils.closeStream(in);
    }
}
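The flag matters in this reducer because boolean data has no ratings to weight with: predicted scores reduce to plain sums of similarity values. An illustrative aggregation sketch using Mahout's math vectors (this is not the reducer's actual code; the method and parameter names are invented):

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class AggregationSketch {
    // Illustrative only: with boolean data each similarity column counts with weight 1;
    // with rated data each column is weighted by the user's preference value.
    static Vector aggregate(Vector[] similarityColumns, float[] prefs, boolean booleanData) {
        Vector scores = new DenseVector(similarityColumns[0].size());
        for (int i = 0; i < similarityColumns.length; i++) {
            double weight = booleanData ? 1.0 : prefs[i];
            scores = scores.plus(similarityColumns[i].times(weight));
        }
        return scores;
    }
}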
From source file: nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java
License: Apache License
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    addInputOption();
    // addOutputOption(); // no output path, we use a table!
    addOption("outputtable", "ot", "Output table name");
    addOption("similarityClassname", "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerItem", "m",
        "try to cap the number of similar items per item to this number "
            + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
        "try to cap the number of cooccurrences per item to this number "
            + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
        String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("minPrefsPerUser", "mp",
        "ignore users with less preferences than this "
            + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
        String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    // Path outputPath = getOutputPath();
    String outputTable = parsedArgs.get("--outputtable");
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class,
            ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class,
            ItemIDIndexReducer.class, VarIntWritable.class, VarLongWritable.class,
            SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class,
            ToItemPrefsMapper.class, VarLongWritable.class,
            booleanData ? VarLongWritable.class : EntityPrefWritable.class,
            ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
            SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
            CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
            CountUsersReducer.class, VarIntWritable.class, NullWritable.class,
            TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
            SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
            DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
            maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /*
     * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...)
     */
    try {
        ToolRunner.run(getConf(), new RowSimilarityZieOok(), new String[] {
            "-Dmapred.input.dir=" + itemUserMatrixPath,
            "-Dmapred.output.dir=" + similarityMatrixPath,
            "--numberOfColumns", String.valueOf(numberOfUsers),
            "--similarityClassname", similarityClassName,
            "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
            "--tempDir", tempDirPath.toString() });
    } catch (Exception e) {
        throw new IllegalStateException("item-item-similarity computation failed", e);
    }

    // This step writes the data to a file, we don't want that, it should be written in HBase directly:
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareMostSimilarItems(similarityMatrixPath, outputTable);
        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.waitForCompletion(true);
        task.setCurrentJob(mostSimilarItems).waitForCompletion(Log.isDebugEnabled());
        // Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath, SequenceFileInputFormat.class,
        //     MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        //     MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
        //     TextOutputFormat.class);
        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
        // mostSimilarItems.waitForCompletion(true);
    }
    return 0;
}
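Both drivers above share the same two-step pattern around the flag: the mapper output value class switches between VarLongWritable (item ID only) and EntityPrefWritable (item ID plus rating), and the flag itself is persisted in the job configuration for the reduce side to read back. A condensed sketch of just that pattern (the method and job names are illustrative):

import org.apache.hadoop.mapreduce.Job;
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
import org.apache.mahout.math.VarLongWritable;

public class UserVectorConfigSketch {
    static void configure(Job toUserVectors, boolean booleanData) {
        // Without ratings a preference is just an item ID; with ratings it is ID plus value.
        // EntityPrefWritable extends VarLongWritable, which is why the switch is type-safe.
        toUserVectors.setMapOutputValueClass(
            booleanData ? VarLongWritable.class : EntityPrefWritable.class);
        // Persist the flag so mappers and reducers can read it back in setup().
        toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    }
}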
From source file: semvec.mahout.UserItemPrefMapper.java
License: Apache License
@Override
protected void setup(Context context) {
    Configuration jobConf = context.getConfiguration();
    booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    transpose = jobConf.getBoolean(TRANSPOSE_USER_ITEM, false);

    String d = jobConf.get(LSHDriver.DIMENSION);
    String r = jobConf.get(LSHDriver.RANDOMSEED);
    Random seedbase = new Random();
    if (null == d) {
        dimension = 2;
    } else {
        dimension = Integer.parseInt(d);
    }
    if (null != r) {
        seedbase = new Random(Integer.parseInt(r));
    }

    userRandom = new Random[dimension];
    for (int dim = 0; dim < dimension; dim++) {
        userRandom[dim] = new Random(seedbase.nextLong());
    }
    itemRandom = new Random[dimension];
    for (int dim = 0; dim < dimension; dim++) {
        itemRandom[dim] = new Random(seedbase.nextLong());
    }
    invertPyramid = new double[dimension];
}
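For completeness, the map side that produces the values these setup() methods consume typically branches on the same flag when emitting. A simplified sketch modeled loosely on Mahout's ToEntityPrefsMapper (this is not the class above; the comma-separated input layout is an assumption):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
import org.apache.mahout.math.VarLongWritable;

public class PrefEmitSketch extends Mapper<LongWritable, Text, VarLongWritable, VarLongWritable> {

    private boolean booleanData;

    @Override
    protected void setup(Context context) {
        booleanData = context.getConfiguration().getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Assumed input layout: userID,itemID[,prefValue]
        String[] tokens = value.toString().split(",");
        long userID = Long.parseLong(tokens[0]);
        long itemID = Long.parseLong(tokens[1]);
        if (booleanData) {
            // No rating: the item ID itself is the whole preference.
            context.write(new VarLongWritable(userID), new VarLongWritable(itemID));
        } else {
            // EntityPrefWritable extends VarLongWritable, so it satisfies the declared value type.
            float prefValue = Float.parseFloat(tokens[2]);
            context.write(new VarLongWritable(userID), new EntityPrefWritable(itemID, prefValue));
        }
    }
}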