Example usage for org.apache.spark.api.java.function.PairFlatMapFunction

List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction

Introduction

On this page you can find example usages of org.apache.spark.api.java.function.PairFlatMapFunction.

Prototype

PairFlatMapFunction

Source Link

Usage

From source file:DAAL.DistributedHDFSDataSet.java

License:Open Source License

/**
 * Reads the text file named by {@code _filename} and, within each Spark partition, packs the
 * lines into DAAL numeric tables holding at most {@code maxRowsPerTable} rows each.
 *
 * <p>A table is emitted whenever the row limit is reached or the partition runs out of lines.
 * Every emitted pair uses the key {@code 0}; the index produced by {@code zipWithIndex} is not
 * used when building the tables.
 *
 * @param sc              Spark context used to read the file
 * @param minPartitions   minimum number of partitions passed to {@code textFile}
 * @param maxRowsPerTable maximum number of rows loaded into a single table
 * @return pair RDD of packed numeric tables
 */
public JavaPairRDD<Integer, HomogenNumericTable> getAsPairRDDPartitioned(JavaSparkContext sc, int minPartitions,
        final long maxRowsPerTable) {
    JavaRDD<String> rawData = sc.textFile(_filename, minPartitions);
    JavaPairRDD<String, Long> dataWithId = rawData.zipWithIndex();

    JavaPairRDD<Integer, HomogenNumericTable> data = dataWithId.mapPartitionsToPair(
            new PairFlatMapFunction<Iterator<Tuple2<String, Long>>, Integer, HomogenNumericTable>() {
                public List<Tuple2<Integer, HomogenNumericTable>> call(Iterator<Tuple2<String, Long>> it) {
                    DaalContext context = new DaalContext();
                    try {
                        List<Tuple2<Integer, HomogenNumericTable>> tables = new ArrayList<Tuple2<Integer, HomogenNumericTable>>();
                        StringDataSource dataSource = new StringDataSource(context, "");
                        long curRow = 0;

                        while (it.hasNext()) {
                            dataSource.setData(it.next()._1);
                            dataSource.loadDataBlock(1, curRow, maxRowsPerTable);
                            curRow++;

                            // Flush the current table when it is full or the partition is exhausted.
                            if (curRow == maxRowsPerTable || !it.hasNext()) {
                                HomogenNumericTable table = (HomogenNumericTable) dataSource.getNumericTable();
                                table.setNumberOfRows(curRow);
                                table.pack();
                                tables.add(new Tuple2<Integer, HomogenNumericTable>(0, table));

                                // Start a fresh data source for the next table.
                                dataSource = new StringDataSource(context, "");
                                curRow = 0;
                            }
                        }

                        return tables;
                    } finally {
                        // Dispose the context even when loading a line fails part-way through.
                        context.dispose();
                    }
                }
            });

    return data;
}

From source file:DAAL.SparkImplicitALSSparse.java

License:Open Source License

/**
 * Regroups the step-3 partial models by destination block and joins them with the input data
 * and the step-2 results.
 *
 * <p>Each step-3 result is unpacked and its {@code outputOfStep3ForStep4} collection is turned
 * into pairs of (collection key, (source key, packed model)). The pairs are grouped by key,
 * joined with {@code dataRDD} and {@code step2MasterResult}, and flattened into a
 * {@code Tuple3} per key.
 *
 * @param step2MasterResult step-2 partial results by key
 * @param step3LocalResult  step-3 partial results by key
 * @param dataRDD           CSR data tables by key
 * @return per key: the data table, the models received from other keys, and the step-2 result
 */
public static JavaPairRDD<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>> redistributeBlocks(
        JavaPairRDD<Integer, DistributedPartialResultStep2> step2MasterResult,
        JavaPairRDD<Integer, DistributedPartialResultStep3> step3LocalResult,
        JavaPairRDD<Integer, CSRNumericTable> dataRDD) {

    JavaPairRDD<Integer, Tuple2<Integer, PartialModel>> step3LocalResultKeyValue = step3LocalResult
            .flatMapToPair(
                    new PairFlatMapFunction<Tuple2<Integer, DistributedPartialResultStep3>, Integer, Tuple2<Integer, PartialModel>>() {
                        public Iterable<Tuple2<Integer, Tuple2<Integer, PartialModel>>> call(
                                Tuple2<Integer, DistributedPartialResultStep3> tup) {
                            DaalContext context = new DaalContext();
                            try {
                                DistributedPartialResultStep3 partialResultStep3 = tup._2;
                                partialResultStep3.unpack(context);

                                KeyValueDataCollection collection = partialResultStep3
                                        .get(DistributedPartialResultStep3Id.outputOfStep3ForStep4);

                                List<Tuple2<Integer, Tuple2<Integer, PartialModel>>> list = new LinkedList<Tuple2<Integer, Tuple2<Integer, PartialModel>>>();

                                for (int i = 0; i < collection.size(); i++) {
                                    PartialModel model = (PartialModel) collection.getValueByIndex(i);
                                    model.pack();
                                    // Value: (key of the producing block, packed model).
                                    Tuple2<Integer, PartialModel> blockFromIdWithModel = new Tuple2<Integer, PartialModel>(
                                            tup._1, model);
                                    // Key: the collection key, used below as the grouping key.
                                    list.add(new Tuple2<Integer, Tuple2<Integer, PartialModel>>(
                                            (int) collection.getKeyByIndex(i), blockFromIdWithModel));
                                }

                                return list;
                            } finally {
                                // Dispose the context even when unpacking or packing fails.
                                context.dispose();
                            }
                        }
                    });

    JavaPairRDD<Integer, Iterable<Tuple2<Integer, PartialModel>>> blocksFromOtherNodes = step3LocalResultKeyValue
            .groupByKey();

    JavaPairRDD<Integer, Tuple2<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>>> rddWithData = dataRDD
            .join(blocksFromOtherNodes);

    // Flatten ((data, models), step2) into a single Tuple3 per key.
    JavaPairRDD<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>> rddToCompute = rddWithData
            .join(step2MasterResult).mapToPair(
                    new PairFunction<Tuple2<Integer, Tuple2<Tuple2<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>>, DistributedPartialResultStep2>>, Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>>() {
                        public Tuple2<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>> call(
                                Tuple2<Integer, Tuple2<Tuple2<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>>, DistributedPartialResultStep2>> tup) {
                            return new Tuple2<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>>(
                                    tup._1,
                                    new Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>(
                                            tup._2._1._1, tup._2._1._2, tup._2._2));
                        }
                    });
    return rddToCompute;
}

From source file:esiptestbed.mudrod.utils.MatrixUtil.java

License:Apache License

/**
 * createWordDocMatrix:Create matrix from doc-terms JavaPairRDD.
 *
 * @param uniqueDocRDD/*from  ww  w.j a va  2 s .  co  m*/
 *          doc-terms JavaPairRDD, in which each key is a doc name, and value
 *          is term list extracted from that doc
 * @param sc
 *          spark context
 * @return LabeledRowMatrix {@link esiptestbed.mudrod.utils.LabeledRowMatrix}
 */
public static LabeledRowMatrix createWordDocMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD,
        JavaSparkContext sc) {
    // Index documents with unique IDs
    JavaPairRDD<List<String>, Long> corpus = uniqueDocRDD.values().zipWithIndex();
    // cal word-doc numbers
    JavaPairRDD<Tuple2<String, Long>, Double> worddoc_num_RDD = corpus
            .flatMapToPair(new PairFlatMapFunction<Tuple2<List<String>, Long>, Tuple2<String, Long>, Double>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, Long>, Double>> call(Tuple2<List<String>, Long> docwords)
                        throws Exception {
                    List<Tuple2<Tuple2<String, Long>, Double>> pairs = new ArrayList<Tuple2<Tuple2<String, Long>, Double>>();
                    List<String> words = docwords._1;
                    int n = words.size();
                    for (int i = 0; i < n; i++) {
                        Tuple2<String, Long> worddoc = new Tuple2<String, Long>(words.get(i), docwords._2);
                        pairs.add(new Tuple2<Tuple2<String, Long>, Double>(worddoc, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });
    // cal word doc-numbers
    JavaPairRDD<String, Tuple2<List<Long>, List<Double>>> word_docnum_RDD = worddoc_num_RDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, Long>, Double>, String, Tuple2<List<Long>, List<Double>>>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<Tuple2<String, Long>, Double> worddoc_num) throws Exception {
                    List<Long> docs = new ArrayList<Long>();
                    docs.add(worddoc_num._1._2);
                    List<Double> nums = new ArrayList<Double>();
                    nums.add(worddoc_num._2);
                    Tuple2<List<Long>, List<Double>> docmums = new Tuple2<List<Long>, List<Double>>(docs, nums);
                    return new Tuple2<String, Tuple2<List<Long>, List<Double>>>(worddoc_num._1._1, docmums);
                }
            });
    // trans to vector
    final int corporsize = (int) uniqueDocRDD.keys().count();
    JavaPairRDD<String, Vector> word_vectorRDD = word_docnum_RDD.reduceByKey(
            new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                        Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                    arg0._1.addAll(arg1._1);
                    arg0._2.addAll(arg1._2);
                    return new Tuple2<List<Long>, List<Double>>(arg0._1, arg0._2);
                }
            }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    int docsize = arg0._2._1.size();
                    int[] intArray = new int[docsize];
                    double[] doubleArray = new double[docsize];
                    for (int i = 0; i < docsize; i++) {
                        intArray[i] = arg0._2._1.get(i).intValue();
                        doubleArray[i] = arg0._2._2.get(i).intValue();
                    }
                    Vector sv = Vectors.sparse(corporsize, intArray, doubleArray);
                    return new Tuple2<String, Vector>(arg0._1, sv);
                }
            });

    RowMatrix wordDocMatrix = new RowMatrix(word_vectorRDD.values().rdd());

    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.wordDocMatrix = wordDocMatrix;
    labeledRowMatrix.words = word_vectorRDD.keys().collect();
    labeledRowMatrix.docs = uniqueDocRDD.keys().collect();
    return labeledRowMatrix;
}

From source file:esiptestbed.mudrod.utils.MatrixUtil.java

License:Apache License

/**
 * Create a doc-word matrix from a doc-terms JavaPairRDD.
 *
 * <p>Each distinct word is given an index with {@code zipWithIndex}; every row of the resulting
 * matrix is a sparse vector for one document, holding the occurrence count per word index.
 *
 * @param uniqueDocRDD
 *          doc-terms JavaPairRDD, in which each key is a doc name, and value
 *          is term list extracted from that doc
 * @param sc
 *          spark context (not used by this method)
 * @return labeled matrix whose rows are the per-document vectors
 */
public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD,
        JavaSparkContext sc) {
    // Index word with unique IDs
    JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values()
            .flatMap(new FlatMapFunction<List<String>, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<String> call(List<String> arg0) throws Exception {
                    return arg0.iterator();
                }
            }).distinct().zipWithIndex();

    // Count how often each (document, word) pair occurs
    JavaPairRDD<Tuple2<String, String>, Double> docword_num_RDD = uniqueDocRDD.flatMapToPair(
            new PairFlatMapFunction<Tuple2<String, List<String>>, Tuple2<String, String>, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, String>, Double>> call(
                        Tuple2<String, List<String>> docwords) throws Exception {
                    List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<Tuple2<Tuple2<String, String>, Double>>();
                    for (String word : docwords._2) {
                        Tuple2<String, String> worddoc = new Tuple2<String, String>(docwords._1, word);
                        pairs.add(new Tuple2<Tuple2<String, String>, Double>(worddoc, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });

    // Re-key by word so the counts can be joined with the word indices: word -> (document, count)
    JavaPairRDD<String, Tuple2<String, Double>> word_docnum_RDD = docword_num_RDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, String>, Double>, String, Tuple2<String, Double>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<String, Double>> call(Tuple2<Tuple2<String, String>, Double> arg0)
                        throws Exception {
                    Tuple2<String, Double> wordmums = new Tuple2<String, Double>(arg0._1._1, arg0._2);
                    return new Tuple2<String, Tuple2<String, Double>>(arg0._1._2, wordmums);
                }
            });

    // Attach the word index to every (document, count) entry
    JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> testRDD = word_docnum_RDD
            .leftOuterJoin(wordIDRDD);

    final int wordsize = (int) wordIDRDD.count();
    JavaPairRDD<String, Vector> doc_vectorRDD = testRDD.mapToPair(
            new PairFunction<Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>>, String, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> arg0) throws Exception {
                    // Falls back to index 0 when the join found no index for the word.
                    Optional<Long> oid = arg0._2._2;
                    Long wordId = (long) 0;
                    if (oid.isPresent()) {
                        wordId = oid.get();
                    }

                    List<Long> word = new ArrayList<Long>();
                    word.add(wordId);

                    List<Double> count = new ArrayList<Double>();
                    count.add(arg0._2._1._2);

                    Tuple2<List<Long>, List<Double>> wordcount = new Tuple2<List<Long>, List<Double>>(word,
                            count);

                    // Re-key by document name.
                    return new Tuple2<String, Tuple2<List<Long>, List<Double>>>(arg0._2._1._1, wordcount);
                }

            }).reduceByKey(
                    new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                                Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                            arg0._1.addAll(arg1._1);
                            arg0._2.addAll(arg1._2);
                            return new Tuple2<List<Long>, List<Double>>(arg0._1, arg0._2);
                        }
                    })
            .mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    List<Long> wordIds = arg0._2._1;
                    List<Double> counts = arg0._2._2;
                    int docsize = wordIds.size();
                    int[] intArray = new int[docsize];
                    double[] doubleArray = new double[docsize];
                    for (int i = 0; i < docsize; i++) {
                        intArray[i] = wordIds.get(i).intValue();
                        // doubleValue(), not intValue(): the target is a double[] and must not be truncated.
                        doubleArray[i] = counts.get(i).doubleValue();
                    }
                    // NOTE(review): indices are in reduce order, not sorted; confirm the Spark version in
                    // use accepts unsorted indices in Vectors.sparse.
                    Vector sv = Vectors.sparse(wordsize, intArray, doubleArray);
                    return new Tuple2<String, Vector>(arg0._1, sv);
                }
            });

    RowMatrix docwordMatrix = new RowMatrix(doc_vectorRDD.values().rdd());

    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.wordDocMatrix = docwordMatrix;
    // NOTE(review): "words" receives the document names (row labels) and "docs" receives the
    // words (column labels); presumably the fields act as row/column labels here - confirm
    // against callers before renaming or swapping.
    labeledRowMatrix.words = doc_vectorRDD.keys().collect();
    labeledRowMatrix.docs = wordIDRDD.keys().collect();

    return labeledRowMatrix;
}

From source file:fiji.TemporalMedian.java

License:Open Source License

/**
 * Remove first array of pixels and shift the others to the left.
 *
 * <p>Every element at index {@code i} with {@code 0 < i < wmax} is written to index
 * {@code i - 1} of {@code tWinPix}; the list is modified in place and returned. Index 0 is
 * skipped because it has no slot to its left.
 *
 * <p>NOTE(review): the element at index {@code wmax} is not shifted because of the
 * {@code < wmax} bound kept from the original; confirm whether it should be inclusive.
 *
 * @param tWinPix list of pixel arrays, modified in place
 * @param wmax    exclusive upper bound on the source indices that are shifted
 * @return the same {@code tWinPix} instance
 */
final List<float[]> rmFirst(List<float[]> tWinPix, int wmax) {
    SparkConf conf = new SparkConf().setAppName("spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        JavaPairRDD<float[], Long> indexed = sc.parallelize(tWinPix).zipWithIndex();
        final int wmax_final = wmax;

        JavaPairRDD<Integer, float[]> mapEmits = indexed
                .flatMapToPair(new PairFlatMapFunction<Tuple2<float[], Long>, Integer, float[]>() {
                    public Iterator<Tuple2<Integer, float[]>> call(Tuple2<float[], Long> casper_data_set_i)
                            throws Exception {
                        List<Tuple2<Integer, float[]>> emits = new ArrayList<Tuple2<Integer, float[]>>();

                        long index = casper_data_set_i._2;
                        if (index > 0 && index < wmax_final) {
                            // Typed tuple with an int key: a raw Tuple2 would carry a Long key and
                            // fail with ClassCastException when the map keys are read as Integer.
                            emits.add(new Tuple2<Integer, float[]>((int) (index - 1), casper_data_set_i._1));
                        }

                        return emits.iterator();
                    }
                });

        Map<Integer, float[]> shifted = mapEmits.collectAsMap();
        for (Map.Entry<Integer, float[]> entry : shifted.entrySet()) {
            tWinPix.set(entry.getKey(), entry.getValue());
        }
    } finally {
        // Stop the context so a later call can create a new one.
        sc.stop();
    }
    return tWinPix;
}

From source file:fiji.TemporalMedian.java

License:Open Source License

/**
 * Standard deviation of a vector of float values.
 *
 * <p>Computes the population standard deviation (divides by the number of elements) in two
 * Spark passes: the first sums the values to get the mean, the second sums the squared
 * deviations from that mean. An empty list yields {@code NaN}.
 *
 * @param vec values to analyse
 * @return square root of the mean squared deviation from the mean
 */
final float calcSD(List<Float> vec) {
    SparkConf conf = new SparkConf().setAppName("spark");
    // One context serves both passes and is stopped when done.
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        JavaRDD<java.lang.Float> values = sc.parallelize(vec);
        float count = (float) vec.size();

        Function2<Float, Float, Float> add = new Function2<Float, Float, Float>() {
            public Float call(Float val1, Float val2) throws Exception {
                return (val1 + val2);
            }
        };

        // Pass 1: sum of all values under the single key 1.
        JavaPairRDD<Integer, Float> valueEmits = values
                .flatMapToPair(new PairFlatMapFunction<java.lang.Float, Integer, Float>() {
                    public Iterator<Tuple2<Integer, Float>> call(java.lang.Float vec_t) throws Exception {
                        List<Tuple2<Integer, Float>> emits = new ArrayList<Tuple2<Integer, Float>>();

                        emits.add(new Tuple2<Integer, Float>(1, vec_t));

                        return emits.iterator();
                    }
                });
        // The key is absent when the input is empty, so guard the unboxing.
        Float sum = valueEmits.reduceByKey(add).collectAsMap().get(1);
        final float mean_final = (sum == null ? 0f : sum) / count;

        // Pass 2: sum of squared deviations from the mean.
        JavaPairRDD<Integer, Float> deviationEmits = values
                .flatMapToPair(new PairFlatMapFunction<java.lang.Float, Integer, Float>() {
                    public Iterator<Tuple2<Integer, Float>> call(java.lang.Float vec_i) throws Exception {
                        List<Tuple2<Integer, Float>> emits = new ArrayList<Tuple2<Integer, Float>>();

                        float deviation = vec_i - mean_final;
                        emits.add(new Tuple2<Integer, Float>(1, deviation * deviation));

                        return emits.iterator();
                    }
                });
        Float squaredSum = deviationEmits.reduceByKey(add).collectAsMap().get(1);
        float variance = (squaredSum == null ? 0f : squaredSum) / count;

        return (float) Math.sqrt(variance);
    } finally {
        sc.stop();
    }
}

From source file:fiji.Trails.java

License:Open Source License

/**
 * Calculate mean of array of floats.
 *
 * <p>The values are summed with a Spark map/reduce under a single key and the sum is divided
 * by the number of elements. An empty list yields {@code NaN}.
 *
 * @param tvec values to average
 * @return arithmetic mean of {@code tvec}
 */
final float mean(List<Float> tvec) {
    SparkConf conf = new SparkConf().setAppName("spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        JavaRDD<java.lang.Float> values = sc.parallelize(tvec);

        JavaPairRDD<Integer, Float> mapEmits = values
                .flatMapToPair(new PairFlatMapFunction<java.lang.Float, Integer, Float>() {
                    public Iterator<Tuple2<Integer, Float>> call(java.lang.Float tvec_t) throws Exception {
                        List<Tuple2<Integer, Float>> emits = new ArrayList<Tuple2<Integer, Float>>();

                        // Emit the value itself; a nested tuple here would break the Float reducer.
                        emits.add(new Tuple2<Integer, Float>(1, tvec_t));

                        return emits.iterator();
                    }
                });

        JavaPairRDD<Integer, Float> reduceEmits = mapEmits.reduceByKey(new Function2<Float, Float, Float>() {
            public Float call(Float val1, Float val2) throws Exception {
                return (val1 + val2);
            }
        });

        // The key is absent when the input is empty, so guard the unboxing.
        Float sum = reduceEmits.collectAsMap().get(1);
        return (sum == null ? 0f : sum) / (float) tvec.size();
    } finally {
        // Stop the context so a later call can create a new one.
        sc.stop();
    }
}

From source file:fiji.Trails.java

License:Open Source License

/**
 * Remove first array of pixels and shift the others to the left.
 *
 * <p>Every element at index {@code i} with {@code 0 < i < wmax} is written to index
 * {@code i - 1} of {@code tWinPix}; the list is modified in place and returned. Index 0 is
 * skipped because it has no slot to its left.
 *
 * <p>NOTE(review): the element at index {@code wmax} is not shifted because of the
 * {@code < wmax} bound kept from the original; confirm whether it should be inclusive.
 *
 * @param tWinPix list of pixel arrays, modified in place
 * @param wmax    exclusive upper bound on the source indices that are shifted
 * @return the same {@code tWinPix} instance
 */
final List<float[]> rmFirst(List<float[]> tWinPix, int wmax) {
    SparkConf conf = new SparkConf().setAppName("spark");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        JavaPairRDD<float[], Long> indexed = sc.parallelize(tWinPix).zipWithIndex();
        final int wmax_final = wmax;

        JavaPairRDD<Integer, float[]> mapEmits = indexed
                .flatMapToPair(new PairFlatMapFunction<Tuple2<float[], Long>, Integer, float[]>() {
                    public Iterator<Tuple2<Integer, float[]>> call(Tuple2<float[], Long> casper_data_set_i)
                            throws Exception {
                        List<Tuple2<Integer, float[]>> emits = new ArrayList<Tuple2<Integer, float[]>>();

                        long index = casper_data_set_i._2;
                        if (index > 0 && index < wmax_final) {
                            // Typed tuple with an int key: a raw Tuple2 would carry a Long key and
                            // fail with ClassCastException when the map keys are read as Integer.
                            emits.add(new Tuple2<Integer, float[]>((int) (index - 1), casper_data_set_i._1));
                        }

                        return emits.iterator();
                    }
                });

        Map<Integer, float[]> shifted = mapEmits.collectAsMap();
        for (Map.Entry<Integer, float[]> entry : shifted.entrySet()) {
            tWinPix.set(entry.getKey(), entry.getValue());
        }
    } finally {
        // Stop the context so a later call can create a new one.
        sc.stop();
    }
    return tWinPix;
}

From source file:gov.nasa.jpl.mudrod.utils.MatrixUtil.java

License:Apache License

/**
 * Create matrix from doc-terms JavaPairRDD.
 *
 * @param uniqueDocRDD/*  www  .  j  a va 2s.c o m*/
 *          doc-terms JavaPairRDD, in which each key is a doc name, and value
 *          is term list extracted from that doc
 * @return LabeledRowMatrix {@link LabeledRowMatrix}
 */
public static LabeledRowMatrix createWordDocMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD) {
    // Index documents with unique IDs
    JavaPairRDD<List<String>, Long> corpus = uniqueDocRDD.values().zipWithIndex();
    // cal word-doc numbers
    JavaPairRDD<Tuple2<String, Long>, Double> worddocNumRDD = corpus
            .flatMapToPair(new PairFlatMapFunction<Tuple2<List<String>, Long>, Tuple2<String, Long>, Double>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, Long>, Double>> call(Tuple2<List<String>, Long> docwords)
                        throws Exception {
                    List<Tuple2<Tuple2<String, Long>, Double>> pairs = new ArrayList<>();
                    List<String> words = docwords._1;
                    int n = words.size();
                    for (int i = 0; i < n; i++) {
                        Tuple2<String, Long> worddoc = new Tuple2<>(words.get(i), docwords._2);
                        pairs.add(new Tuple2<Tuple2<String, Long>, Double>(worddoc, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });
    // cal word doc-numbers
    JavaPairRDD<String, Tuple2<List<Long>, List<Double>>> wordDocnumRDD = worddocNumRDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, Long>, Double>, String, Tuple2<List<Long>, List<Double>>>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<Tuple2<String, Long>, Double> worddocNum) throws Exception {
                    List<Long> docs = new ArrayList<>();
                    docs.add(worddocNum._1._2);
                    List<Double> nums = new ArrayList<>();
                    nums.add(worddocNum._2);
                    Tuple2<List<Long>, List<Double>> docmums = new Tuple2<>(docs, nums);
                    return new Tuple2<>(worddocNum._1._1, docmums);
                }
            });
    // trans to vector
    final int corporsize = (int) uniqueDocRDD.keys().count();
    JavaPairRDD<String, Vector> wordVectorRDD = wordDocnumRDD.reduceByKey(
            new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                        Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                    arg0._1.addAll(arg1._1);
                    arg0._2.addAll(arg1._2);
                    return new Tuple2<>(arg0._1, arg0._2);
                }
            }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                /**
                 *
                 */
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    int docsize = arg0._2._1.size();
                    int[] intArray = new int[docsize];
                    double[] doubleArray = new double[docsize];
                    for (int i = 0; i < docsize; i++) {
                        intArray[i] = arg0._2._1.get(i).intValue();
                        doubleArray[i] = arg0._2._2.get(i).intValue();
                    }
                    Vector sv = Vectors.sparse(corporsize, intArray, doubleArray);
                    return new Tuple2<>(arg0._1, sv);
                }
            });

    RowMatrix wordDocMatrix = new RowMatrix(wordVectorRDD.values().rdd());

    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.rowMatrix = wordDocMatrix;
    labeledRowMatrix.rowkeys = wordVectorRDD.keys().collect();
    labeledRowMatrix.colkeys = uniqueDocRDD.keys().collect();
    return labeledRowMatrix;
}

From source file:gov.nasa.jpl.mudrod.utils.MatrixUtil.java

License:Apache License

/**
 * Builds a document-by-word count matrix from tokenized documents.
 *
 * <p>Every distinct word found in {@code uniqueDocRDD} receives a numeric ID through
 * {@code zipWithIndex()}; that ID is used as the word's column index. Each document key is
 * reduced to a single sparse row whose entries are the number of times each word occurs in
 * that document's word list.
 *
 * @param uniqueDocRDD pairs of (document key, words of that document); a word that appears
 *        several times in one list is counted once per occurrence
 * @param sc Spark context; not used by this method, kept so existing callers still compile
 * @return a {@code LabeledRowMatrix} whose {@code rowMatrix} holds one sparse row per document
 *         key, whose {@code rowkeys} are the collected document keys and whose {@code colkeys}
 *         are the collected distinct words
 */
public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD,
        JavaSparkContext sc) {
    // word -> unique numeric ID (later used as the column index of that word)
    JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values()
            .flatMap(new FlatMapFunction<List<String>, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<String> call(List<String> words) throws Exception {
                    return words.iterator();
                }
            }).distinct().zipWithIndex();

    // (doc, word) -> number of occurrences of the word in the doc
    JavaPairRDD<Tuple2<String, String>, Double> docwordNumRDD = uniqueDocRDD.flatMapToPair(
            new PairFlatMapFunction<Tuple2<String, List<String>>, Tuple2<String, String>, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, String>, Double>> call(
                        Tuple2<String, List<String>> docwords) throws Exception {
                    List<String> words = docwords._2;
                    List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<>(words.size());
                    for (String word : words) {
                        // Emit one unit count per occurrence; they are summed by reduceByKey below.
                        Tuple2<String, String> docword = new Tuple2<>(docwords._1, word);
                        pairs.add(new Tuple2<Tuple2<String, String>, Double>(docword, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });

    // Re-key by word so the counts can be joined with the word IDs: word -> (doc, count)
    JavaPairRDD<String, Tuple2<String, Double>> wordDocnumRDD = docwordNumRDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, String>, Double>, String, Tuple2<String, Double>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<String, Double>> call(Tuple2<Tuple2<String, String>, Double> arg0)
                        throws Exception {
                    Tuple2<String, Double> docCount = new Tuple2<>(arg0._1._1, arg0._2);
                    return new Tuple2<>(arg0._1._2, docCount);
                }
            });

    // word -> ((doc, count), optional word ID)
    JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> joinedRDD = wordDocnumRDD
            .leftOuterJoin(wordIDRDD);

    // Number of distinct words == number of columns of the resulting matrix.
    int wordsize = (int) wordIDRDD.count();
    JavaPairRDD<String, Vector> docVectorRDD = joinedRDD.mapToPair(
            new PairFunction<Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>>, String, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> arg0) throws Exception {
                    Optional<Long> joinedId = arg0._2._2;
                    // Both sides of the join derive from uniqueDocRDD, so an ID is expected to be
                    // present; 0 is retained as the fallback the method has always used.
                    long wordId = joinedId.isPresent() ? joinedId.get() : 0L;

                    // Start each document with single-element lists; reduceByKey concatenates them.
                    List<Long> wordIds = new ArrayList<>();
                    wordIds.add(wordId);

                    List<Double> counts = new ArrayList<>();
                    counts.add(arg0._2._1._2);

                    Tuple2<List<Long>, List<Double>> wordcount = new Tuple2<>(wordIds, counts);
                    return new Tuple2<>(arg0._2._1._1, wordcount);
                }

            }).reduceByKey(
                    new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                                Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                            // Append in place: IDs and counts stay aligned position by position.
                            arg0._1.addAll(arg1._1);
                            arg0._2.addAll(arg1._2);
                            return arg0;
                        }
                    })
            .mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    List<Long> wordIds = arg0._2._1;
                    List<Double> counts = arg0._2._2;
                    int entries = wordIds.size();

                    // The entries arrive in whatever order reduceByKey merged them, so hand them
                    // to the overload that accepts unordered (index, value) pairs instead of the
                    // array overload, which requires strictly increasing indices.
                    List<Tuple2<Integer, Double>> elements = new ArrayList<>(entries);
                    for (int i = 0; i < entries; i++) {
                        Integer column = Integer.valueOf(wordIds.get(i).intValue());
                        // Keep the count as a double; it used to be truncated through intValue().
                        elements.add(new Tuple2<Integer, Double>(column, counts.get(i)));
                    }
                    Vector sv = Vectors.sparse(wordsize, elements);
                    return new Tuple2<>(arg0._1, sv);
                }
            });

    RowMatrix docwordMatrix = new RowMatrix(docVectorRDD.values().rdd());

    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.rowMatrix = docwordMatrix;
    labeledRowMatrix.rowkeys = docVectorRDD.keys().collect();
    labeledRowMatrix.colkeys = wordIDRDD.keys().collect();

    return labeledRowMatrix;
}