List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction
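A PairFlatMapFunction<T, K, V> turns one input record into zero or more (K, V) pairs and is passed to flatMapToPair (or mapPartitionsToPair). Before the examples from real projects, here is a minimal self-contained sketch; the class name and data are invented, and it assumes Spark 2.x, where call returns an Iterator (in Spark 1.x it returned an Iterable, as the two DAAL examples below still do).

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("PairFlatMapDemo").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "c"));
        // One line in, zero or more (word, 1) pairs out.
        JavaPairRDD<String, Integer> pairs = lines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    public Iterator<Tuple2<String, Integer>> call(String line) throws Exception {
                        List<Tuple2<String, Integer>> out = new ArrayList<Tuple2<String, Integer>>();
                        for (String w : line.split(" ")) {
                            out.add(new Tuple2<String, Integer>(w, 1));
                        }
                        return out.iterator();
                    }
                });
        System.out.println(pairs.collectAsMap()); // {a=1, b=1, c=1}
        sc.stop();
    }
}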
From source file: DAAL.DistributedHDFSDataSet.java
License: Open Source License
public JavaPairRDD<Integer, HomogenNumericTable> getAsPairRDDPartitioned(JavaSparkContext sc, int minPartitions,
        final long maxRowsPerTable) {
    JavaRDD<String> rawData = sc.textFile(_filename, minPartitions);
    JavaPairRDD<String, Long> dataWithId = rawData.zipWithIndex();
    JavaPairRDD<Integer, HomogenNumericTable> data = dataWithId.mapPartitionsToPair(
            new PairFlatMapFunction<Iterator<Tuple2<String, Long>>, Integer, HomogenNumericTable>() {
                public List<Tuple2<Integer, HomogenNumericTable>> call(Iterator<Tuple2<String, Long>> it) {
                    DaalContext context = new DaalContext();
                    long maxRows = maxRowsPerTable;
                    long curRow = 0;
                    ArrayList<Tuple2<Integer, HomogenNumericTable>> tables = new ArrayList<Tuple2<Integer, HomogenNumericTable>>();
                    StringDataSource dataSource = new StringDataSource(context, "");
                    while (it.hasNext()) {
                        // Feed one CSV line into the DAAL data source and parse it as one row.
                        dataSource.setData(it.next()._1);
                        dataSource.loadDataBlock(1, curRow, maxRows);
                        curRow++;
                        // Close out a table when it is full or the partition is exhausted.
                        if (curRow == maxRows || !(it.hasNext())) {
                            HomogenNumericTable table = (HomogenNumericTable) dataSource.getNumericTable();
                            table.setNumberOfRows(curRow);
                            table.pack(); // detach from native memory so Spark can serialize it
                            Tuple2<Integer, HomogenNumericTable> tuple = new Tuple2<Integer, HomogenNumericTable>(
                                    0, table);
                            tables.add(tuple);
                            dataSource = new StringDataSource(context, "");
                            curRow = 0;
                        }
                    }
                    context.dispose();
                    return tables;
                }
            });
    return data;
}
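A hypothetical call site for the method above (the DistributedHDFSDataSet constructor and the input path are assumptions; the method reads the _filename field set elsewhere in the class):

// Hypothetical usage sketch: split an HDFS text file into DAAL numeric-table
// blocks of at most 1000 rows each, using at least 4 partitions.
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("DAAL-load"));
DistributedHDFSDataSet dataSet = new DistributedHDFSDataSet("/path/to/data.csv"); // constructor assumed
JavaPairRDD<Integer, HomogenNumericTable> tables = dataSet.getAsPairRDDPartitioned(sc, 4, 1000);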
From source file: DAAL.SparkImplicitALSSparse.java
License: Open Source License
public static JavaPairRDD<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>> redistributeBlocks(
        JavaPairRDD<Integer, DistributedPartialResultStep2> step2MasterResult,
        JavaPairRDD<Integer, DistributedPartialResultStep3> step3LocalResult,
        JavaPairRDD<Integer, CSRNumericTable> dataRDD) {
    // Re-key every partial model by the node that needs it.
    JavaPairRDD<Integer, Tuple2<Integer, PartialModel>> step3LocalResultKeyValue = step3LocalResult.flatMapToPair(
            new PairFlatMapFunction<Tuple2<Integer, DistributedPartialResultStep3>, Integer, Tuple2<Integer, PartialModel>>() {
                public Iterable<Tuple2<Integer, Tuple2<Integer, PartialModel>>> call(
                        Tuple2<Integer, DistributedPartialResultStep3> tup) {
                    DaalContext context = new DaalContext();
                    DistributedPartialResultStep3 partialResultStep3 = tup._2;
                    partialResultStep3.unpack(context);
                    KeyValueDataCollection collection = partialResultStep3
                            .get(DistributedPartialResultStep3Id.outputOfStep3ForStep4);
                    List<Tuple2<Integer, Tuple2<Integer, PartialModel>>> list = new LinkedList<Tuple2<Integer, Tuple2<Integer, PartialModel>>>();
                    for (int i = 0; i < collection.size(); i++) {
                        PartialModel m1 = (PartialModel) collection.getValueByIndex(i);
                        m1.pack(); // make the model serializable before it crosses the shuffle
                        Tuple2<Integer, PartialModel> blockFromIdWithModel = new Tuple2<Integer, PartialModel>(
                                tup._1, m1);
                        Tuple2<Integer, Tuple2<Integer, PartialModel>> blockToIdWithTuple = new Tuple2<Integer, Tuple2<Integer, PartialModel>>(
                                (int) collection.getKeyByIndex(i), blockFromIdWithModel);
                        list.add(blockToIdWithTuple);
                    }
                    context.dispose();
                    return list;
                }
            });
    // Group the incoming models per destination node, then join with the local
    // data blocks and the step-2 master results.
    JavaPairRDD<Integer, Iterable<Tuple2<Integer, PartialModel>>> blocksFromOtherNodes = step3LocalResultKeyValue
            .groupByKey();
    JavaPairRDD<Integer, Tuple2<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>>> rddWithData = dataRDD
            .join(blocksFromOtherNodes);
    JavaPairRDD<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>> rddToCompute = rddWithData
            .join(step2MasterResult).mapToPair(
                    new PairFunction<Tuple2<Integer, Tuple2<Tuple2<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>>, DistributedPartialResultStep2>>, Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>>() {
                        public Tuple2<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>> call(
                                Tuple2<Integer, Tuple2<Tuple2<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>>, DistributedPartialResultStep2>> tup) {
                            // Flatten the nested join output into (nodeId, (data, models, step2Result)).
                            return new Tuple2<Integer, Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>>(
                                    tup._1,
                                    new Tuple3<CSRNumericTable, Iterable<Tuple2<Integer, PartialModel>>, DistributedPartialResultStep2>(
                                            tup._2._1._1, tup._2._1._2, tup._2._2));
                        }
                    });
    return rddToCompute;
}
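The detail worth noting in both DAAL examples is the pack()/unpack() pairing: DAAL objects wrap native memory, so they must be packed into a serializable form before crossing a shuffle boundary and unpacked into a fresh DaalContext on the other side. A minimal sketch of that round trip on a plain numeric table; the HomogenNumericTable constructor arguments are an assumption about the DAAL Java API, not taken from the source above.

import com.intel.daal.data_management.data.HomogenNumericTable;
import com.intel.daal.services.DaalContext;

public class PackUnpackSketch {
    public static void main(String[] args) {
        DaalContext context = new DaalContext();
        // A 2x2 table backed by native memory (nFeatures = 2, nVectors = 2); constructor assumed.
        HomogenNumericTable table = new HomogenNumericTable(context, new double[] { 1, 2, 3, 4 }, 2, 2);
        table.pack(); // detach from native memory; the object is now Java-serializable
        context.dispose();
        // On the receiving executor, reattach before use.
        DaalContext remote = new DaalContext();
        table.unpack(remote);
        System.out.println(table.getNumberOfRows()); // 2
        remote.dispose();
    }
}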
From source file: esiptestbed.mudrod.utils.MatrixUtil.java
License: Apache License
/**
 * createWordDocMatrix: create a matrix from a doc-terms JavaPairRDD.
 *
 * @param uniqueDocRDD
 *          doc-terms JavaPairRDD, in which each key is a doc name and the value
 *          is the term list extracted from that doc
 * @param sc
 *          spark context
 * @return LabeledRowMatrix {@link esiptestbed.mudrod.utils.LabeledRowMatrix}
 */
public static LabeledRowMatrix createWordDocMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD,
        JavaSparkContext sc) {
    // Index documents with unique IDs
    JavaPairRDD<List<String>, Long> corpus = uniqueDocRDD.values().zipWithIndex();
    // Count (word, docId) occurrences
    JavaPairRDD<Tuple2<String, Long>, Double> worddoc_num_RDD = corpus
            .flatMapToPair(new PairFlatMapFunction<Tuple2<List<String>, Long>, Tuple2<String, Long>, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, Long>, Double>> call(Tuple2<List<String>, Long> docwords)
                        throws Exception {
                    // Emit ((word, docId), 1.0) for every term occurrence in the document.
                    List<Tuple2<Tuple2<String, Long>, Double>> pairs = new ArrayList<Tuple2<Tuple2<String, Long>, Double>>();
                    List<String> words = docwords._1;
                    int n = words.size();
                    for (int i = 0; i < n; i++) {
                        Tuple2<String, Long> worddoc = new Tuple2<String, Long>(words.get(i), docwords._2);
                        pairs.add(new Tuple2<Tuple2<String, Long>, Double>(worddoc, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });
    // Regroup the counts by word: (word, ([docId], [count]))
    JavaPairRDD<String, Tuple2<List<Long>, List<Double>>> word_docnum_RDD = worddoc_num_RDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, Long>, Double>, String, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<Tuple2<String, Long>, Double> worddoc_num) throws Exception {
                    List<Long> docs = new ArrayList<Long>();
                    docs.add(worddoc_num._1._2);
                    List<Double> nums = new ArrayList<Double>();
                    nums.add(worddoc_num._2);
                    Tuple2<List<Long>, List<Double>> docmums = new Tuple2<List<Long>, List<Double>>(docs, nums);
                    return new Tuple2<String, Tuple2<List<Long>, List<Double>>>(worddoc_num._1._1, docmums);
                }
            });
    // Turn each word's (docId, count) lists into a sparse vector over all documents
    final int corporsize = (int) uniqueDocRDD.keys().count();
    JavaPairRDD<String, Vector> word_vectorRDD = word_docnum_RDD.reduceByKey(
            new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                        Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                    arg0._1.addAll(arg1._1);
                    arg0._2.addAll(arg1._2);
                    return new Tuple2<List<Long>, List<Double>>(arg0._1, arg0._2);
                }
            }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    int docsize = arg0._2._1.size();
                    int[] intArray = new int[docsize];
                    double[] doubleArray = new double[docsize];
                    for (int i = 0; i < docsize; i++) {
                        intArray[i] = arg0._2._1.get(i).intValue();
                        doubleArray[i] = arg0._2._2.get(i).doubleValue();
                    }
                    Vector sv = Vectors.sparse(corporsize, intArray, doubleArray);
                    return new Tuple2<String, Vector>(arg0._1, sv);
                }
            });
    RowMatrix wordDocMatrix = new RowMatrix(word_vectorRDD.values().rdd());
    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.wordDocMatrix = wordDocMatrix;
    labeledRowMatrix.words = word_vectorRDD.keys().collect();
    labeledRowMatrix.docs = uniqueDocRDD.keys().collect();
    return labeledRowMatrix;
}
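A hypothetical driver for the method above, with a two-document corpus invented for illustration (assumes an existing JavaSparkContext sc):

// Hypothetical usage sketch for createWordDocMatrix.
JavaPairRDD<String, List<String>> docTerms = sc.parallelizePairs(Arrays.asList(
        new Tuple2<String, List<String>>("doc1", Arrays.asList("ocean", "temperature")),
        new Tuple2<String, List<String>>("doc2", Arrays.asList("ocean", "salinity"))));
LabeledRowMatrix m = MatrixUtil.createWordDocMatrix(docTerms, sc);
// Rows are words and columns are documents: entry (w, d) counts word w in document d.
System.out.println(m.words); // word labels, e.g. [ocean, temperature, salinity]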
From source file: esiptestbed.mudrod.utils.MatrixUtil.java
License: Apache License
public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD,
        JavaSparkContext sc) {
    // Index words with unique IDs
    JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values()
            .flatMap(new FlatMapFunction<List<String>, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<String> call(List<String> arg0) throws Exception {
                    return arg0.iterator();
                }
            }).distinct().zipWithIndex();
    // Count (doc, word) occurrences
    JavaPairRDD<Tuple2<String, String>, Double> docword_num_RDD = uniqueDocRDD.flatMapToPair(
            new PairFlatMapFunction<Tuple2<String, List<String>>, Tuple2<String, String>, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, String>, Double>> call(Tuple2<String, List<String>> docwords)
                        throws Exception {
                    List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<Tuple2<Tuple2<String, String>, Double>>();
                    List<String> words = docwords._2;
                    int n = words.size();
                    for (int i = 0; i < n; i++) {
                        Tuple2<String, String> worddoc = new Tuple2<String, String>(docwords._1, words.get(i));
                        pairs.add(new Tuple2<Tuple2<String, String>, Double>(worddoc, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });
    // Re-key by word so the counts can be joined with the word IDs
    JavaPairRDD<String, Tuple2<String, Double>> word_docnum_RDD = docword_num_RDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, String>, Double>, String, Tuple2<String, Double>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<String, Double>> call(Tuple2<Tuple2<String, String>, Double> arg0)
                        throws Exception {
                    Tuple2<String, Double> wordmums = new Tuple2<String, Double>(arg0._1._1, arg0._2);
                    return new Tuple2<String, Tuple2<String, Double>>(arg0._1._2, wordmums);
                }
            });
    // Attach the word IDs; words without an ID come back as absent Optionals
    JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> testRDD = word_docnum_RDD
            .leftOuterJoin(wordIDRDD);
    int wordsize = (int) wordIDRDD.count();
    JavaPairRDD<String, Vector> doc_vectorRDD = testRDD.mapToPair(
            new PairFunction<Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>>, String, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> arg0) throws Exception {
                    Optional<Long> oid = arg0._2._2;
                    Long wordId = (long) 0;
                    if (oid.isPresent()) {
                        wordId = oid.get();
                    }
                    List<Long> word = new ArrayList<Long>();
                    word.add(wordId);
                    List<Double> count = new ArrayList<Double>();
                    count.add(arg0._2._1._2);
                    Tuple2<List<Long>, List<Double>> wordcount = new Tuple2<List<Long>, List<Double>>(word, count);
                    return new Tuple2<String, Tuple2<List<Long>, List<Double>>>(arg0._2._1._1, wordcount);
                }
            }).reduceByKey(
                    new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                                Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                            arg0._1.addAll(arg1._1);
                            arg0._2.addAll(arg1._2);
                            return new Tuple2<List<Long>, List<Double>>(arg0._1, arg0._2);
                        }
                    })
            .mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    int docsize = arg0._2._1.size();
                    int[] intArray = new int[docsize];
                    double[] doubleArray = new double[docsize];
                    for (int i = 0; i < docsize; i++) {
                        intArray[i] = arg0._2._1.get(i).intValue();
                        doubleArray[i] = arg0._2._2.get(i).doubleValue();
                    }
                    Vector sv = Vectors.sparse(wordsize, intArray, doubleArray);
                    return new Tuple2<String, Vector>(arg0._1, sv);
                }
            });
    RowMatrix docwordMatrix = new RowMatrix(doc_vectorRDD.values().rdd());
    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.wordDocMatrix = docwordMatrix;
    labeledRowMatrix.words = doc_vectorRDD.keys().collect();
    labeledRowMatrix.docs = wordIDRDD.keys().collect();
    return labeledRowMatrix;
}
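The transposed variant, reusing the docTerms RDD from the previous sketch. Note that here the field names are swapped relative to their contents: words receives the document names (row labels) and docs receives the distinct words (column labels). The gov.nasa.jpl version of this class, shown below, renames the fields to rowkeys and colkeys.

// Hypothetical usage sketch for createDocWordMatrix.
LabeledRowMatrix t = MatrixUtil.createDocWordMatrix(docTerms, sc);
System.out.println(t.words); // actually the document names (row labels)
System.out.println(t.docs);  // actually the distinct words (column labels)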
From source file: fiji.TemporalMedian.java
License: Open Source License
/** Remove the first array of pixels and shift the others to the left. */
final List<float[]> rmFirst(List<float[]> tWinPix, int wmax) {
    {
        SparkConf conf = new SparkConf().setAppName("spark");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<float[]> rdd_0_2_0 = sc.parallelize(tWinPix);
        JavaPairRDD<float[], Long> rdd_0_2_1 = rdd_0_2_0.zipWithIndex();
        final int wmax_final = wmax;
        JavaPairRDD<Integer, float[]> mapEmits = rdd_0_2_1
                .flatMapToPair(new PairFlatMapFunction<Tuple2<float[], Long>, Integer, float[]>() {
                    public Iterator<Tuple2<Integer, float[]>> call(Tuple2<float[], Long> casper_data_set_i)
                            throws Exception {
                        List<Tuple2<Integer, float[]>> emits = new ArrayList<Tuple2<Integer, float[]>>();
                        // Re-emit every frame except the first, one slot to the left.
                        if (casper_data_set_i._2 > 0 && casper_data_set_i._2 < wmax_final)
                            emits.add(new Tuple2<Integer, float[]>((int) (casper_data_set_i._2 - 1),
                                    casper_data_set_i._1));
                        return emits.iterator();
                    }
                });
        Map<Integer, float[]> output_rdd_0_2 = mapEmits.collectAsMap();
        sc.stop();
        for (Integer output_rdd_0_2_k : output_rdd_0_2.keySet()) {
            tWinPix.set(output_rdd_0_2_k, output_rdd_0_2.get(output_rdd_0_2_k));
        }
    }
    return tWinPix;
}
From source file: fiji.TemporalMedian.java
License: Open Source License
/** Standard deviation of a vector of float values. */
final float calcSD(List<Float> vec) {
    float sd = 0;
    float mean = 0;
    float variance = 0;
    {
        // Sum all values under a single reduce key.
        SparkConf conf = new SparkConf().setAppName("spark");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<java.lang.Float> rdd_0_3 = sc.parallelize(vec);
        JavaPairRDD<Integer, Float> mapEmits = rdd_0_3
                .flatMapToPair(new PairFlatMapFunction<java.lang.Float, Integer, Float>() {
                    public Iterator<Tuple2<Integer, Float>> call(java.lang.Float vec_t) throws Exception {
                        List<Tuple2<Integer, Float>> emits = new ArrayList<Tuple2<Integer, Float>>();
                        emits.add(new Tuple2<Integer, Float>(1, vec_t));
                        return emits.iterator();
                    }
                });
        JavaPairRDD<Integer, Float> reduceEmits = mapEmits.reduceByKey(new Function2<Float, Float, Float>() {
            public Float call(Float val1, Float val2) throws Exception {
                return (val1 + val2);
            }
        });
        Map<Integer, Float> output_rdd_0_3 = reduceEmits.collectAsMap();
        mean = output_rdd_0_3.get(1);
        sc.stop();
    }
    mean = mean / (float) vec.size();
    {
        // Sum the squared deviations from the mean.
        SparkConf conf = new SparkConf().setAppName("spark");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<java.lang.Float> rdd_0_4 = sc.parallelize(vec);
        final float mean_final = mean;
        JavaPairRDD<Integer, Float> mapEmits = rdd_0_4
                .flatMapToPair(new PairFlatMapFunction<java.lang.Float, Integer, Float>() {
                    public Iterator<Tuple2<Integer, Float>> call(java.lang.Float vec_i) throws Exception {
                        List<Tuple2<Integer, Float>> emits = new ArrayList<Tuple2<Integer, Float>>();
                        emits.add(new Tuple2<Integer, Float>(1, (vec_i - mean_final) * (vec_i - mean_final)));
                        return emits.iterator();
                    }
                });
        JavaPairRDD<Integer, Float> reduceEmits = mapEmits.reduceByKey(new Function2<Float, Float, Float>() {
            public Float call(Float val1, Float val2) throws Exception {
                return (val1 + val2);
            }
        });
        Map<Integer, Float> output_rdd_0_0 = reduceEmits.collectAsMap();
        variance = output_rdd_0_0.get(1);
        sc.stop();
    }
    variance = variance / (float) vec.size();
    sd = (float) Math.sqrt(variance);
    return sd;
}
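Both this method and the mean() in fiji.Trails below funnel every element to a single reducer key. Spark's JavaDoubleRDD provides the same statistics directly; a minimal alternative sketch (input data invented for illustration):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class StatsSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("stats").setMaster("local[*]"));
        List<Double> vec = Arrays.asList(1.0, 2.0, 3.0, 4.0);
        JavaDoubleRDD values = sc.parallelizeDoubles(vec);
        System.out.println(values.mean());  // 2.5
        System.out.println(values.stdev()); // population standard deviation, as calcSD computes
        sc.stop();
    }
}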
From source file: fiji.Trails.java
License: Open Source License
/** Calculate the mean of an array of floats. */
final float mean(List<Float> tvec) {
    float mean = 0;
    {
        SparkConf conf = new SparkConf().setAppName("spark");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<java.lang.Float> rdd_0_1 = sc.parallelize(tvec);
        JavaPairRDD<Integer, Float> mapEmits = rdd_0_1
                .flatMapToPair(new PairFlatMapFunction<java.lang.Float, Integer, Float>() {
                    public Iterator<Tuple2<Integer, Float>> call(java.lang.Float tvec_t) throws Exception {
                        List<Tuple2<Integer, Float>> emits = new ArrayList<Tuple2<Integer, Float>>();
                        // Emit every value under the same key so reduceByKey computes the total.
                        emits.add(new Tuple2<Integer, Float>(1, tvec_t));
                        return emits.iterator();
                    }
                });
        JavaPairRDD<Integer, Float> reduceEmits = mapEmits.reduceByKey(new Function2<Float, Float, Float>() {
            public Float call(Float val1, Float val2) throws Exception {
                return (val1 + val2);
            }
        });
        Map<Integer, Float> output_rdd_0_1 = reduceEmits.collectAsMap();
        mean = output_rdd_0_1.get(1);
        sc.stop();
    }
    return mean / (float) tvec.size();
}
From source file: fiji.Trails.java
License: Open Source License
/** Remove the first array of pixels and shift the others to the left. */
final List<float[]> rmFirst(List<float[]> tWinPix, int wmax) {
    {
        SparkConf conf = new SparkConf().setAppName("spark");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<float[]> rdd_0_2_0 = sc.parallelize(tWinPix);
        JavaPairRDD<float[], Long> rdd_0_2_1 = rdd_0_2_0.zipWithIndex();
        final int wmax_final = wmax;
        JavaPairRDD<Integer, float[]> mapEmits = rdd_0_2_1
                .flatMapToPair(new PairFlatMapFunction<Tuple2<float[], Long>, Integer, float[]>() {
                    public Iterator<Tuple2<Integer, float[]>> call(Tuple2<float[], Long> casper_data_set_i)
                            throws Exception {
                        List<Tuple2<Integer, float[]>> emits = new ArrayList<Tuple2<Integer, float[]>>();
                        // Re-emit every frame except the first, one slot to the left.
                        if (casper_data_set_i._2 > 0 && casper_data_set_i._2 < wmax_final)
                            emits.add(new Tuple2<Integer, float[]>((int) (casper_data_set_i._2 - 1),
                                    casper_data_set_i._1));
                        return emits.iterator();
                    }
                });
        Map<Integer, float[]> output_rdd_0_2 = mapEmits.collectAsMap();
        sc.stop();
        for (Integer output_rdd_0_2_k : output_rdd_0_2.keySet()) {
            tWinPix.set(output_rdd_0_2_k, output_rdd_0_2.get(output_rdd_0_2_k));
        }
    }
    return tWinPix;
}
From source file: gov.nasa.jpl.mudrod.utils.MatrixUtil.java
License: Apache License
/**
 * Create a matrix from a doc-terms JavaPairRDD.
 *
 * @param uniqueDocRDD
 *          doc-terms JavaPairRDD, in which each key is a doc name and the value
 *          is the term list extracted from that doc
 * @return LabeledRowMatrix {@link LabeledRowMatrix}
 */
public static LabeledRowMatrix createWordDocMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD) {
    // Index documents with unique IDs
    JavaPairRDD<List<String>, Long> corpus = uniqueDocRDD.values().zipWithIndex();
    // Count (word, docId) occurrences
    JavaPairRDD<Tuple2<String, Long>, Double> worddocNumRDD = corpus
            .flatMapToPair(new PairFlatMapFunction<Tuple2<List<String>, Long>, Tuple2<String, Long>, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, Long>, Double>> call(Tuple2<List<String>, Long> docwords)
                        throws Exception {
                    List<Tuple2<Tuple2<String, Long>, Double>> pairs = new ArrayList<>();
                    List<String> words = docwords._1;
                    int n = words.size();
                    for (int i = 0; i < n; i++) {
                        Tuple2<String, Long> worddoc = new Tuple2<>(words.get(i), docwords._2);
                        pairs.add(new Tuple2<Tuple2<String, Long>, Double>(worddoc, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });
    // Regroup the counts by word: (word, ([docId], [count]))
    JavaPairRDD<String, Tuple2<List<Long>, List<Double>>> wordDocnumRDD = worddocNumRDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, Long>, Double>, String, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<Tuple2<String, Long>, Double> worddocNum) throws Exception {
                    List<Long> docs = new ArrayList<>();
                    docs.add(worddocNum._1._2);
                    List<Double> nums = new ArrayList<>();
                    nums.add(worddocNum._2);
                    Tuple2<List<Long>, List<Double>> docmums = new Tuple2<>(docs, nums);
                    return new Tuple2<>(worddocNum._1._1, docmums);
                }
            });
    // Turn each word's (docId, count) lists into a sparse vector over all documents
    final int corporsize = (int) uniqueDocRDD.keys().count();
    JavaPairRDD<String, Vector> wordVectorRDD = wordDocnumRDD.reduceByKey(
            new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                        Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                    arg0._1.addAll(arg1._1);
                    arg0._2.addAll(arg1._2);
                    return new Tuple2<>(arg0._1, arg0._2);
                }
            }).mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    int docsize = arg0._2._1.size();
                    int[] intArray = new int[docsize];
                    double[] doubleArray = new double[docsize];
                    for (int i = 0; i < docsize; i++) {
                        intArray[i] = arg0._2._1.get(i).intValue();
                        doubleArray[i] = arg0._2._2.get(i).doubleValue();
                    }
                    Vector sv = Vectors.sparse(corporsize, intArray, doubleArray);
                    return new Tuple2<>(arg0._1, sv);
                }
            });
    RowMatrix wordDocMatrix = new RowMatrix(wordVectorRDD.values().rdd());
    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.rowMatrix = wordDocMatrix;
    labeledRowMatrix.rowkeys = wordVectorRDD.keys().collect();
    labeledRowMatrix.colkeys = uniqueDocRDD.keys().collect();
    return labeledRowMatrix;
}
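A short usage sketch of the renamed API, reusing the docTerms RDD from the earlier sketch; note this variant no longer takes a JavaSparkContext parameter.

// Hypothetical usage sketch.
LabeledRowMatrix m2 = MatrixUtil.createWordDocMatrix(docTerms);
System.out.println(m2.rowkeys); // words (row labels)
System.out.println(m2.colkeys); // document names (column labels)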
From source file: gov.nasa.jpl.mudrod.utils.MatrixUtil.java
License: Apache License
public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD,
        JavaSparkContext sc) {
    // Index words with unique IDs
    JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values()
            .flatMap(new FlatMapFunction<List<String>, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<String> call(List<String> arg0) throws Exception {
                    return arg0.iterator();
                }
            }).distinct().zipWithIndex();
    // Count (doc, word) occurrences
    JavaPairRDD<Tuple2<String, String>, Double> docwordNumRDD = uniqueDocRDD.flatMapToPair(
            new PairFlatMapFunction<Tuple2<String, List<String>>, Tuple2<String, String>, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Tuple2<String, String>, Double>> call(Tuple2<String, List<String>> docwords)
                        throws Exception {
                    List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<>();
                    List<String> words = docwords._2;
                    int n = words.size();
                    for (int i = 0; i < n; i++) {
                        Tuple2<String, String> worddoc = new Tuple2<>(docwords._1, words.get(i));
                        pairs.add(new Tuple2<Tuple2<String, String>, Double>(worddoc, 1.0));
                    }
                    return pairs.iterator();
                }
            }).reduceByKey(new Function2<Double, Double, Double>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Double call(Double first, Double second) throws Exception {
                    return first + second;
                }
            });
    // Re-key by word so the counts can be joined with the word IDs
    JavaPairRDD<String, Tuple2<String, Double>> wordDocnumRDD = docwordNumRDD.mapToPair(
            new PairFunction<Tuple2<Tuple2<String, String>, Double>, String, Tuple2<String, Double>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<String, Double>> call(Tuple2<Tuple2<String, String>, Double> arg0)
                        throws Exception {
                    Tuple2<String, Double> wordmums = new Tuple2<>(arg0._1._1, arg0._2);
                    return new Tuple2<>(arg0._1._2, wordmums);
                }
            });
    // Attach the word IDs; words without an ID come back as absent Optionals
    JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> testRDD = wordDocnumRDD
            .leftOuterJoin(wordIDRDD);
    int wordsize = (int) wordIDRDD.count();
    JavaPairRDD<String, Vector> docVectorRDD = testRDD.mapToPair(
            new PairFunction<Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>>, String, Tuple2<List<Long>, List<Double>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Tuple2<List<Long>, List<Double>>> call(
                        Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> arg0) throws Exception {
                    Optional<Long> oid = arg0._2._2;
                    Long wordId = (long) 0;
                    if (oid.isPresent()) {
                        wordId = oid.get();
                    }
                    List<Long> word = new ArrayList<>();
                    word.add(wordId);
                    List<Double> count = new ArrayList<>();
                    count.add(arg0._2._1._2);
                    Tuple2<List<Long>, List<Double>> wordcount = new Tuple2<>(word, count);
                    return new Tuple2<>(arg0._2._1._1, wordcount);
                }
            }).reduceByKey(
                    new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0,
                                Tuple2<List<Long>, List<Double>> arg1) throws Exception {
                            arg0._1.addAll(arg1._1);
                            arg0._2.addAll(arg1._2);
                            return new Tuple2<>(arg0._1, arg0._2);
                        }
                    })
            .mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0)
                        throws Exception {
                    int docsize = arg0._2._1.size();
                    int[] intArray = new int[docsize];
                    double[] doubleArray = new double[docsize];
                    for (int i = 0; i < docsize; i++) {
                        intArray[i] = arg0._2._1.get(i).intValue();
                        doubleArray[i] = arg0._2._2.get(i).doubleValue();
                    }
                    Vector sv = Vectors.sparse(wordsize, intArray, doubleArray);
                    return new Tuple2<>(arg0._1, sv);
                }
            });
    RowMatrix docwordMatrix = new RowMatrix(docVectorRDD.values().rdd());
    LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
    labeledRowMatrix.rowMatrix = docwordMatrix;
    labeledRowMatrix.rowkeys = docVectorRDD.keys().collect();
    labeledRowMatrix.colkeys = wordIDRDD.keys().collect();
    return labeledRowMatrix;
}
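One detail shared by both createDocWordMatrix variants is the leftOuterJoin, which wraps the right-hand value in Spark's org.apache.spark.api.java.Optional; an absent value means the word never received an ID. A minimal sketch of handling it, with invented data (assumes an existing JavaSparkContext sc):

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.Optional;

import scala.Tuple2;

// Per-word counts, joined against a deliberately incomplete word-ID table.
JavaPairRDD<String, Double> counts = sc.parallelizePairs(Arrays.asList(
        new Tuple2<String, Double>("ocean", 2.0),
        new Tuple2<String, Double>("salinity", 1.0)));
JavaPairRDD<String, Long> ids = sc.parallelizePairs(Arrays.asList(
        new Tuple2<String, Long>("ocean", 0L)));
JavaPairRDD<String, Tuple2<Double, Optional<Long>>> joined = counts.leftOuterJoin(ids);
for (Tuple2<String, Tuple2<Double, Optional<Long>>> t : joined.collect()) {
    Optional<Long> oid = t._2._2;
    long wordId = oid.isPresent() ? oid.get() : 0L; // same default the methods above use
    System.out.println(t._1 + " -> id " + wordId + ", count " + t._2._1);
}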