Example usage for org.apache.mahout.common HadoopUtil countRecords

Introduction

On this page you can find example usage for org.apache.mahout.common HadoopUtil countRecords.

Prototype

public static long countRecords(Path path, PathType pt, PathFilter filter, Configuration conf)
        throws IOException 

Document

Count all the records in a directory using an org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator.
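
Before the full job below, here is a minimal sketch of calling countRecords directly (the directory path is a placeholder): PathType.LIST treats the path as a plain directory of SequenceFiles, and passing a null PathFilter counts the records in every file found there.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.iterator.sequencefile.PathType;

public class CountRecordsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Count every value in the SequenceFiles under the given directory.
        // PathType.LIST lists the directory's files directly; a null
        // PathFilter means no files are excluded from the count.
        long numRecords = HadoopUtil.countRecords(
                new Path("/tmp/userVectors"), // hypothetical SequenceFile directory
                PathType.LIST,
                null,
                conf);
        System.out.println("Records counted: " + numRecords);
    }
}

The RecommenderJob example below uses the same PathType.LIST / null-filter combination to recover the number of users when the preparation phase has been skipped.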

Usage

From source file: org.gpfvic.mahout.cf.taste.hadoop.item.RecommenderJob.java

License: Apache License

public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", null, "File of users to recommend for", null);
    addOption("itemsFile", null, "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("userItemFile", "uif",
            "File containing comma-separated userID,itemID pairs (optional). "
                    + "Used to include only these items into recommendations. "
                    + "Cannot be used together with usersFile or itemsFile",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mxp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxPrefsInItemSimilarity", "mpiis",
            "max number of preferences to consider per user or item in the "
                    + "item similarity computation phase, users or items with more preferences will be sampled down (default: "
                    + DEFAULT_MAX_PREFS + ')',
            String.valueOf(DEFAULT_MAX_PREFS));
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')',
            true);
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("outputPathForSimilarityMatrix", "opfsm",
            "write the item similarity matrix to this path (optional)", false);
    addOption("randomSeed", null, "use this seed for sampling", false);
    addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path outputPath = getOutputPath();
    int numRecommendations = Integer.parseInt(getOption("numRecommendations"));
    String usersFile = getOption("usersFile");
    String itemsFile = getOption("itemsFile");
    String filterFile = getOption("filterFile");
    String userItemFile = getOption("userItemFile");
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    int maxPrefsInItemSimilarity = Integer.parseInt(getOption("maxPrefsInItemSimilarity"));
    int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    String similarityClassname = getOption("similarityClassname");
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed"))
            : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    Path prepPath = getTempPath(DEFAULT_PREPARE_PATH);
    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path explicitFilterPath = getTempPath("explicitFilterPath");
    Path partialMultiplyPath = getTempPath("partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    int numberOfUsers = -1;

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
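        // phase 1: run PreparePreferenceMatrixJob to build the user vectors,
        // rating matrix and item index from the raw preference input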
        ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData",
                        String.valueOf(booleanData), "--tempDir", getTempPath().toString(), });

        numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

        /* special behavior if phase 1 is skipped */
        if (numberOfUsers == -1) {
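            // recover the user count by counting the records (one per user vector)
            // in the preparation job's user-vectors SequenceFile directory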
            numberOfUsers = (int) HadoopUtil.countRecords(
                    new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), PathType.LIST, null,
                    getConf());
        }

        //calculate the co-occurrence matrix
        ToolRunner.run(getConf(), new RowSimilarityJob(),
                new String[] { "--input",
                        new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                        similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                        "--similarityClassname", similarityClassname, "--maxObservationsPerRow",
                        String.valueOf(maxPrefsInItemSimilarity), "--maxObservationsPerColumn",
                        String.valueOf(maxPrefsInItemSimilarity), "--maxSimilaritiesPerRow",
                        String.valueOf(maxSimilaritiesPerItem), "--excludeSelfSimilarity",
                        String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed",
                        String.valueOf(randomSeed), "--tempDir", getTempPath().toString(), });

        // write out the similarity matrix if the user specified that behavior
        if (hasOption("outputPathForSimilarityMatrix")) {
            Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

            Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
                    SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
                    EntityEntityWritable.class, DoubleWritable.class,
                    ItemSimilarityJob.MostSimilarItemPairsReducer.class, EntityEntityWritable.class,
                    DoubleWritable.class, TextOutputFormat.class);

            Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
            mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
                    new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
            mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
            outputSimilarityMatrix.waitForCompletion(true);
        }
    }

    //start the multiplication of the co-occurrence matrix by the user vectors
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job partialMultiply = new Job(getConf(), "partialMultiply");
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();

        MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
                SimilarityMatrixRowWrapperMapper.class);
        MultipleInputs.addInputPath(partialMultiply,
                new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), SequenceFileInputFormat.class,
                UserVectorSplitterMapper.class);
        partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
        partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
        partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
        partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
        partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
        partialMultiply.setOutputKeyClass(VarIntWritable.class);
        partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
        partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
        partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());

        if (usersFile != null) {
            partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }

        if (userItemFile != null) {
            partialMultiplyConf.set(IDReader.USER_ITEM_FILE, userItemFile);
        }

        partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);

        boolean succeeded = partialMultiply.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        //filter out any users we don't care about
        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            boolean succeeded = itemFiltering.waitForCompletion(true);
            if (!succeeded) {
                return -1;
            }
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput")
                ? SequenceFileOutputFormat.class
                : TextOutputFormat.class;

        //extract out the recommendations
        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, outputFormat);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (userItemFile != null) {
            aggregateAndRecommendConf.set(IDReader.USER_ITEM_FILE, userItemFile);
        }

        if (filterFile != null) {
            setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath,
                    explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}