Example usage for org.apache.mahout.math.hadoop DistributedRowMatrix setConf

Introduction

On this page you can find example usages of the setConf method of org.apache.mahout.math.hadoop DistributedRowMatrix.

Prototype

@Override
    public void setConf(Configuration conf)

Source Link

Usage

From source file:com.mycompany.MyHadoopSamples1.TransposeJob.java

License:Apache License

/**
 * Transposes the input matrix by delegating to {@link DistributedRowMatrix#transpose()}.
 *
 * <p>Registers the input option plus {@code numRows} and {@code numCols}, wraps the input path in
 * a {@link DistributedRowMatrix} configured with a copy of this job's configuration, and invokes
 * {@code transpose()} on it. The matrix returned by {@code transpose()} is not handed back to the
 * caller.
 *
 * @param strings command-line arguments; {@code --numRows} and {@code --numCols} are mandatory
 * @return 0 on success, -1 when {@code parseArguments} yields no argument map
 * @throws Exception if argument handling or the transpose itself fails
 */
public int run(String[] strings) throws Exception {
    addInputOption();
    // Both dimensions are parsed unconditionally below. Declaring them as required lets the
    // argument parser reject a missing value instead of Integer.parseInt(null) throwing a
    // NumberFormatException further down.
    addOption("numRows", "nr", "Number of rows of the input matrix", true);
    addOption("numCols", "nc", "Number of columns of the input matrix", true);
    Map<String, String> parsedArgs = parseArguments(strings);
    if (parsedArgs == null) {
        return -1;
    }

    int numRows = Integer.parseInt(parsedArgs.get("--numRows"));
    int numCols = Integer.parseInt(parsedArgs.get("--numCols"));

    DistributedRowMatrix matrix = new DistributedRowMatrix(getInputPath(), getTempPath(), numRows, numCols);
    // Hand the matrix its own copy so it cannot mutate this job's configuration.
    matrix.setConf(new Configuration(getConf()));
    matrix.transpose();

    return 0;
}

From source file:com.skp.experiment.cf.als.hadoop.DistributedParallelALSFactorizationJob.java

License:Apache License

/**
 * Runs the alternating-least-squares factorization {@code A = U M'}, where
 * <ul>
 *   <li>A (users x items) is the matrix of known ratings,</li>
 *   <li>U (users x features) is the representation of users in the feature space,</li>
 *   <li>M (items x features) is the representation of items in the feature space.</li>
 * </ul>
 *
 * <p>The method first builds A' (item ratings), A (user ratings), the per-user item counts and the
 * average item ratings, initializes M from the averages, and then alternates between recomputing
 * U and M for {@code numIterations} iterations.
 *
 * @param args command-line arguments; {@code lambda}, {@code numFeatures}, {@code numIterations},
 *             {@code numUsers} and {@code numItems} are required
 * @return 0 on success, -1 if the arguments could not be parsed or a preparation job failed
 * @throws Exception if any job or file-system operation fails
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("numUsers", null, "number of users", true);
    addOption("numItems", null, "number of items", true);
    addOption("blockSize", null, "dfs block size.", false);

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
    numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
    lambda = Double.parseDouble(parsedArgs.get("--lambda"));
    alpha = Double.parseDouble(parsedArgs.get("--alpha"));
    implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
    numUsers = Integer.parseInt(parsedArgs.get("--numUsers"));
    numItems = Integer.parseInt(parsedArgs.get("--numItems"));
    // Fall back to 64 MB when no block size was given on the command line.
    dfsBlockSize = getOption("blockSize") == null ? 64 * 1024 * 1024 : Long.parseLong(getOption("blockSize"));

    /* create A' */
    Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
            ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
            IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    itemRatings.setCombinerClass(VectorSumReducer.class);
    // Stop at the first failed stage; later stages read this job's output.
    if (!itemRatings.waitForCompletion(true)) {
        return -1;
    }
    // numItems comes from the command line rather than from the job's output-record counter.
    log.info("Number of Items\t{}", numItems);

    /* create A */
    Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
            IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
            VectorWritable.class);
    userRatings.setCombinerClass(MergeVectorsCombiner.class);
    if (!userRatings.waitForCompletion(true)) {
        return -1;
    }
    // numUsers comes from the command line rather than from the job's output-record counter.
    log.info("Number of Users\t{}", numUsers);

    /* count item per user */
    Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnts"),
            SequenceFileInputFormat.class, UserItemCntsMapper.class, IntWritable.class, IntWritable.class,
            UserItemCntsReducer.class, IntWritable.class, IntWritable.class, SequenceFileOutputFormat.class);
    userItemCntsJob.setJobName("user ratings count");
    userItemCntsJob.setCombinerClass(UserItemCntsReducer.class);
    if (!userItemCntsJob.waitForCompletion(true)) {
        return -1;
    }

    //TODO this could be fiddled into one of the upper jobs
    Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
            AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
            IntWritable.class, VectorWritable.class);
    averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
    if (!averageItemRatings.waitForCompletion(true)) {
        return -1;
    }

    Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());

    /* create an initial M */
    initializeM(averageRatings);

    for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
        // The first iteration reads the initial M, stored under index -1.
        DistributedRowMatrix curM = new DistributedRowMatrix(pathToM(currentIteration - 1),
                getTempPath("Mtemp" + (currentIteration - 1)), numItems, numFeatures);
        // Derive the matrix configuration from the job's configuration so that settings supplied
        // to this job are not silently replaced by defaults.
        curM.setConf(new Configuration(getConf()));
        DistributedRowMatrix YtransposeY = curM.times(curM);

        // broadcast M, read A row-wise, recompute U row-wise //
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                YtransposeY.getRowPath(), numItems);

        DistributedRowMatrix curU = new DistributedRowMatrix(pathToU(currentIteration),
                getTempPath("Utmp" + currentIteration), numUsers, numFeatures);
        curU.setConf(new Configuration(getConf()));
        DistributedRowMatrix XtransposeX = curU.times(curU);

        // set up index of U //
        CreateMapFileFromSeq.createMapFile(pathToU(currentIteration));

        // broadcast U, read A' row-wise, recompute M row-wise //
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runDistributedImplicitSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                XtransposeX.getRowPath(), numUsers);
    }
    return 0;
}

From source file:com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob.java

License:Apache License

/**
 * Runs the alternating-least-squares factorization {@code A = U M'}, where
 * <ul>
 *   <li>A (users x items) is the matrix of known ratings,</li>
 *   <li>U (users x features) is the representation of users in the feature space,</li>
 *   <li>M (items x features) is the representation of items in the feature space.</li>
 * </ul>
 *
 * <p>Matrix dimensions are read from the {@code indexSizes} file. When {@code startIteration} is 0,
 * each preparation stage (transformed input, A', A, per-user item counts, average ratings,
 * initial M) is executed only if its output does not exist yet. The method then alternates
 * between recomputing U and M from {@code startIteration} up to {@code numIterations}, optionally
 * recording a per-iteration distance line and scheduling matrices that are two iterations old for
 * deletion.
 *
 * <p>If {@code useRMSECurve} is set, the accumulated per-iteration text is written to the
 * {@code RMSE} output path on every exit from the try block, including failures.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 if the arguments could not be parsed, a job failed, or an exception
 *         was raised while running
 * @throws Exception if argument parsing fails or writing the RMSE output fails
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("indexSizes", null, "index sizes Path", true);
    addOption("startIteration", null, "start iteration number", String.valueOf(0));
    addOption("oldM", null, "old M matrix Path.", null);
    addOption("largeUserFeatures", null, "true if user x feature matrix is too large for memory",
            String.valueOf(true));
    addOption("rmseCurve", null, "true if want to extract rmse curve", String.valueOf(true));
    addOption("cleanUp", null, "true if want to clean up temporary matrix", String.valueOf(true));
    addOption("useTransform", null, "true if using logarithm as transform", String.valueOf(true));
    addOption("rateIndex", null, "0 based index for rate column in input file.", String.valueOf(2));
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    try {
        // step 0: fetch the dimensions of the training set matrix.
        Map<String, String> indexSizesTmp = ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSizes")),
                DELIMETER, Arrays.asList(0), Arrays.asList(1));

        numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
        numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
        lambda = Double.parseDouble(parsedArgs.get("--lambda"));
        alpha = Double.parseDouble(parsedArgs.get("--alpha"));
        implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
        numUsers = Integer.parseInt(indexSizesTmp.get("0"));
        numItems = Integer.parseInt(indexSizesTmp.get("1"));

        numTaskTrackers = HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks;
        startIteration = Integer.parseInt(parsedArgs.get("--startIteration"));
        largeUserFeatures = Boolean.parseBoolean(getOption("largeUserFeatures"));
        useRMSECurve = Boolean.parseBoolean(getOption("rmseCurve"));
        cleanUp = Boolean.parseBoolean(getOption("cleanUp"));
        useTransform = Boolean.parseBoolean(getOption("useTransform"));
        rateIndex = Integer.parseInt(getOption("rateIndex"));
        FileSystem fs = FileSystem.get(getConf());
        if (!fs.exists(pathToTransformed())) {
            if (useTransform) {
                // transform price into rating
                Job transformJob = prepareJob(getInputPath(), pathToTransformed(), TextInputFormat.class,
                        TransformColumnValueMapper.class, NullWritable.class, Text.class,
                        TextOutputFormat.class);
                // Stop at the first failed stage; later stages read this job's output.
                if (!transformJob.waitForCompletion(true)) {
                    return -1;
                }
            } else {
                // No transform requested: copy the input as-is, keeping the source in place.
                FileUtil.copy(fs, getInputPath(), fs, pathToTransformed(), false, getConf());
            }
        }
        /*
        if (getOption("oldM") != null) {
          runOnetimeSolver(pathToTransformed(), getOutputPath("U"), new Path(getOption("oldM")));
          return 0;
        }
        */
        if (startIteration == 0) {
            if (!fs.exists(pathToItemRatings())) {
                // create A'
                Job itemRatings = prepareJob(pathToTransformed(), pathToItemRatings(), TextInputFormat.class,
                        ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class,
                        VectorSumReducer.class, IntWritable.class, VectorWritable.class,
                        SequenceFileOutputFormat.class);
                itemRatings.setCombinerClass(VectorSumReducer.class);
                // Estimated size of the users x features matrix: 8 bytes per entry times SAFE_MARGIN.
                long matrixSizeExp = (long) (8L * numUsers * numFeatures * SAFE_MARGIN);
                long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT
                        / (long) HadoopClusterUtil.MAP_TASKS_PER_NODE;
                int numTaskPerDataNode = Math.max(1,
                        (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / (double) matrixSizeExp));
                // Only override the reducer count when the estimate exceeds the per-task share.
                if (matrixSizeExp > memoryThreshold) {
                    int numReducer = Math.min(
                            numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()),
                            HadoopClusterUtil.getMaxMapTasks(getConf()));
                    itemRatings.setNumReduceTasks(numReducer);
                }

                if (!itemRatings.waitForCompletion(true)) {
                    return -1;
                }
            }

            if (!fs.exists(pathToUserRatings())) {
                // create A
                Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
                        IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
                        VectorWritable.class);
                userRatings.setCombinerClass(MergeVectorsCombiner.class);
                // Single reducer-count setting; the last value assigned is the one the job uses.
                userRatings.setNumReduceTasks(HadoopClusterUtil.getMaxMapTasks(getConf()));
                if (!userRatings.waitForCompletion(true)) {
                    return -1;
                }
            }
            if (!fs.exists(getOutputPath("userItemCnt"))) {
                // count item per user
                Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnt"),
                        SequenceFileInputFormat.class, UserItemCntsMapper.class, IntWritable.class,
                        IntWritable.class, SequenceFileOutputFormat.class);
                userItemCntsJob.setJobName("user ratings count");
                if (!userItemCntsJob.waitForCompletion(true)) {
                    return -1;
                }
            }

            if (!fs.exists(getTempPath("averageRatings"))) {
                //TODO this could be fiddled into one of the upper jobs
                Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
                        AverageRatingMapper.class, IntWritable.class, VectorWritable.class,
                        MergeVectorsReducer.class, IntWritable.class, VectorWritable.class);
                averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
                if (!averageItemRatings.waitForCompletion(true)) {
                    return -1;
                }
            }
            // The initial M is stored under index -1; skip initialization if it is already there.
            if (!fs.exists(new Path(pathToM(-1), "part-m-00000"))) {
                Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());

                // create an initial M
                initializeM(averageRatings);
            }
        }

        for (int currentIteration = startIteration; currentIteration < numIterations; currentIteration++) {
            DistributedRowMatrix curM = new DistributedRowMatrix(pathToM(currentIteration - 1),
                    getTempPath("Mtemp/tmp-" + String.valueOf(currentIteration - 1) + "/M"), numItems,
                    numFeatures);
            curM.setConf(getConf());
            DistributedRowMatrix YtransposeY = curM.times(curM);
            // broadcast M, read A row-wise, recompute U row-wise
            log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                    YtransposeY.getRowPath(), numItems, false);

            DistributedRowMatrix curU = new DistributedRowMatrix(pathToU(currentIteration),
                    getTempPath("Utmp/tmp-" + String.valueOf(currentIteration) + "/U"), numUsers, numFeatures);
            curU.setConf(getConf());
            DistributedRowMatrix XtransposeX = curU.times(curU);

            // broadcast U, read A' row-wise, recompute M row-wise
            log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                    XtransposeX.getRowPath(), numUsers, largeUserFeatures);

            // Compare each updated matrix with its previous iteration. This is skipped on the
            // first executed iteration.
            if (currentIteration > startIteration && useRMSECurve) {
                Pair<Integer, Double> UsquaredError = calculateMatrixDistanceSquared(
                        pathToU(currentIteration - 1), pathToU(currentIteration), currentIteration);
                Pair<Integer, Double> MsquaredError = calculateMatrixDistanceSquared(
                        pathToM(currentIteration - 1), pathToM(currentIteration), currentIteration);
                String currentRMSE = currentIteration + DELIMETER + UsquaredError.getFirst() + DELIMETER
                        + UsquaredError.getSecond() + DELIMETER + MsquaredError.getFirst() + DELIMETER
                        + MsquaredError.getSecond() + DefaultOptionCreator.NEWLINE;
                rmsePerIteration += currentRMSE;
                log.info("iteration {}: {}", currentIteration, currentRMSE);
            }
            // Matrices from two iterations back are no longer read by this loop.
            if (currentIteration >= startIteration + 2 && cleanUp) {
                fs.deleteOnExit(pathToU(currentIteration - 2));
                fs.deleteOnExit(pathToM(currentIteration - 2));
            }
        }
        return 0;
    } catch (Exception e) {
        // Report through the job's logger, keeping the stack trace, instead of printing to stderr.
        log.error("ALS factorization failed", e);
        return -1;
    } finally {
        if (useRMSECurve) {
            HadoopClusterUtil.writeToHdfs(getConf(), getOutputPath("RMSE"), rmsePerIteration);
        }
    }
}

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/**
 * Assuming that the input is generated by {@link MatrixOutputFormat}, this method
 * convert it to a centralized dense matrix.
 * @param inPath the path to the {@link MapDir matrix}
 * @param nRows/*from   w  w w  .j  a v  a 2 s  . c o  m*/
 * @param nCols
 * @param conf
 * @return
 * @throws IOException
 */
public static DenseMatrix mapDirToDenseMatrix(Path inPath, int nRows, int nCols, Configuration conf)
        throws IOException {
    Path tmpPath = inPath.getParent();
    DistributedRowMatrix distMatrix = new DistributedRowMatrix(inPath, tmpPath, nRows, nCols);
    distMatrix.setConf(conf);
    return toDenseMatrix(distMatrix);
}

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/**
 * Assuming that the input is generated by {@link MatrixOutputFormat}, this method
 * convert it to a centralized sparse matrix.
 * @param inPath the path to the {@link MapDir matrix}
 * @param nRows/*  w w  w. j  a  v a2 s.c om*/
 * @param nCols
 * @param conf
 * @return
 * @throws IOException
 */
public static SparseMatrix mapDirToSparseMatrix(Path inPath, int nRows, int nCols, Configuration conf)
        throws IOException {
    Path tmpPath = inPath.getParent();
    DistributedRowMatrix distMatrix = new DistributedRowMatrix(inPath, tmpPath, nRows, nCols);
    distMatrix.setConf(conf);
    return toSparseMatrix(distMatrix);
}

From source file:com.twitter.algebra.AlgebraCommon.java

License:Apache License

/**
 * Convert an in-memory representation of a matrix to a distributed MapDir
 * format. It then can be used in distributed jobs
 * /*  w w  w. j ava2 s.c o  m*/
 * @param oriMatrix
 * @return path that will contain the matrix files
 * @throws Exception
 */
public static DistributedRowMatrix toMapDir(Matrix origMatrix, Path outPath, Path tmpPath, String label)
        throws Exception {
    Configuration conf = new Configuration();
    Path outputDir = new Path(outPath, label + origMatrix.numRows() + "x" + origMatrix.numCols());
    FileSystem fs = FileSystem.get(outputDir.toUri(), conf);
    if (!fs.exists(outputDir)) {
        Path mapDir = new Path(outputDir, "matrix-k-0");
        Path outputFile = new Path(mapDir, "data");
        @SuppressWarnings("deprecation")
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputFile, IntWritable.class,
                VectorWritable.class);
        VectorWritable vectorw = new VectorWritable();
        IntWritable intw = new IntWritable();
        try {
            for (int r = 0; r < origMatrix.numRows(); r++) {
                Vector vector = origMatrix.viewRow(r);
                vectorw.set(vector);
                intw.set(r);
                writer.append(intw, vectorw);
            }
        } finally {
            writer.close();
        }
        MapFile.fix(fs, mapDir, IntWritable.class, VectorWritable.class, false, conf);
    } else {
        log.warn("----------- Skip matrix " + outputDir + " - already exists");
    }
    DistributedRowMatrix dMatrix = new DistributedRowMatrix(outputDir, tmpPath, origMatrix.numRows(),
            origMatrix.numCols());
    dMatrix.setConf(conf);
    return dMatrix;
}

From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java

License:Apache License

/**
 * Runs a {@link Sequence2MatrixFormatJob} over the rows of {@code A}, writing to
 * {@code <A's output temp path>/label}. If that path already exists the job is skipped and a
 * warning is logged.
 *
 * @param conf the configuration used for the file system, the job, and the returned matrix
 * @param A the input matrix
 * @param label the name of the output directory under {@code A}'s output temp path
 * @return a {@link DistributedRowMatrix} over the output path with the same dimensions as {@code A}
 * @throws IOException declared by this method
 * @throws InterruptedException declared by this method
 * @throws ClassNotFoundException declared by this method
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + Sequence2MatrixFormatJob.class.getName());

    Path resultPath = new Path(A.getOutputTempPath(), label);
    FileSystem fileSystem = FileSystem.get(resultPath.toUri(), conf);
    Sequence2MatrixFormatJob converter = new Sequence2MatrixFormatJob();
    boolean alreadyDone = fileSystem.exists(resultPath);
    if (alreadyDone) {
        log.warn("----------- Skip already exists: " + resultPath);
    } else {
        converter.run(conf, A.getRowPath(), resultPath);
    }

    DistributedRowMatrix result = new DistributedRowMatrix(resultPath, A.getOutputTempPath(), A.numRows(),
            A.numCols());
    result.setConf(conf);
    return result;
}

From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java

License:Apache License

/**
 * Computes A x B for two matrices that are already wrapped in
 * {@link DistributedRowMatrix} objects. See {@link ABInnerHDFSBroadcastOfB} for
 * details of the algorithm.
 *
 * <p>The product is written to {@code <A's output temp path>/label}; if that path
 * already exists the job is skipped and a warning is logged.
 *
 * @param conf the initial configuration
 * @param A matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABInnerHDFSBroadcastOfB.class.getName());
    // The inner dimensions must agree for the product to be defined.
    if (A.numCols() != B.numRows()) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }

    Path productPath = new Path(A.getOutputTempPath(), label);
    FileSystem fileSystem = FileSystem.get(productPath.toUri(), conf);
    ABInnerHDFSBroadcastOfB multiplier = new ABInnerHDFSBroadcastOfB();
    boolean alreadyDone = fileSystem.exists(productPath);
    if (alreadyDone) {
        log.warn("----------- Skip already exists: " + productPath);
    } else {
        multiplier.run(conf, A.getRowPath(), B.getRowPath(), productPath, B.numRows(), B.numCols());
    }

    DistributedRowMatrix product = new DistributedRowMatrix(productPath, A.getOutputTempPath(), A.numRows(),
            B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java

License:Apache License

/**
 * Computes A x B for two matrices that are already wrapped in
 * {@link DistributedRowMatrix} objects. See {@link ABOuterHDFSBroadcastOfA} for
 * details of the algorithm.
 *
 * <p>The product is written to {@code <A's output temp path>/label}; if that path
 * already exists the job is skipped and a warning is logged.
 *
 * @param conf
 *          the initial configuration
 * @param A
 *          matrix A
 * @param B
 *          matrix B
 * @param label
 *          the label for the output directory
 * @return AxB wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + ABOuterHDFSBroadcastOfA.class.getName());
    // The inner dimensions must agree for the product to be defined.
    if (A.numCols() != B.numRows()) {
        throw new CardinalityException(A.numCols(), B.numRows());
    }

    Path productPath = new Path(A.getOutputTempPath(), label);
    FileSystem fileSystem = FileSystem.get(productPath.toUri(), conf);
    ABOuterHDFSBroadcastOfA multiplier = new ABOuterHDFSBroadcastOfA();
    boolean alreadyDone = fileSystem.exists(productPath);
    if (alreadyDone) {
        log.warn("----------- Skip already exists: " + productPath);
    } else {
        multiplier.run(conf, A.getRowPath(), B.getRowPath(), productPath, A.numRows(), A.numCols());
    }

    DistributedRowMatrix product = new DistributedRowMatrix(productPath, A.getOutputTempPath(), A.numRows(),
            B.numCols());
    product.setConf(conf);
    return product;
}

From source file:com.twitter.algebra.matrix.multiply.AtBOuterStaticMapsideJoinJob.java

License:Apache License

/**
 * Runs an {@link AtBOuterStaticMapsideJoinJob} on two matrices that are already wrapped in
 * {@link DistributedRowMatrix} objects.
 *
 * <p>Both matrices must have the same number of rows. The result is written to
 * {@code <A's output temp path>/label} and is described as having {@code A.numCols()} rows and
 * {@code B.numCols()} columns. If the output path already exists the job is skipped and a
 * warning is logged.
 *
 * @param conf the configuration used for the file system, the job, and the returned matrix
 * @param A matrix A
 * @param B matrix B
 * @param label the label for the output directory
 * @return the result wrapped in a DistributedRowMatrix object
 * @throws IOException declared by this method
 * @throws InterruptedException declared by this method
 * @throws ClassNotFoundException declared by this method
 */
public static DistributedRowMatrix run(Configuration conf, DistributedRowMatrix A, DistributedRowMatrix B,
        String label) throws IOException, InterruptedException, ClassNotFoundException {
    log.info("running " + AtBOuterStaticMapsideJoinJob.class.getName());
    // Row counts, not inner dimensions, are what must match here.
    if (A.numRows() != B.numRows()) {
        throw new CardinalityException(A.numRows(), B.numRows());
    }

    Path productPath = new Path(A.getOutputTempPath(), label);
    FileSystem fileSystem = FileSystem.get(productPath.toUri(), conf);
    AtBOuterStaticMapsideJoinJob multiplier = new AtBOuterStaticMapsideJoinJob();
    boolean alreadyDone = fileSystem.exists(productPath);
    if (alreadyDone) {
        log.warn("----------- Skip already exists: " + productPath);
    } else {
        multiplier.run(conf, A.getRowPath(), B.getRowPath(), productPath, B.numCols());
    }

    DistributedRowMatrix product = new DistributedRowMatrix(productPath, A.getOutputTempPath(), A.numCols(),
            B.numCols());
    product.setConf(conf);
    return product;
}