List of usage examples for org.apache.spark.api.java.Optional#isPresent()
public boolean isPresent()
From source file:esiptestbed.mudrod.utils.MatrixUtil.java
License:Apache License
/**
 * Builds a sparse document-by-word count matrix from documents and their word lists.
 *
 * <p>Every distinct word gets a zero-based ID via {@code zipWithIndex}; that ID is the
 * word's column index. Each document becomes one sparse row whose entries are the number
 * of times each word occurs in that document.
 *
 * @param uniqueDocRDD pairs of (document key, list of words in that document)
 * @param sc Spark context; not used by this method
 * @return a {@code LabeledRowMatrix} holding the row matrix plus the collected row keys
 *         and the collected distinct words
 */
public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD,
    JavaSparkContext sc) {
  // Index words with unique IDs.
  JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values()
      .flatMap(docWords -> docWords.iterator())
      .distinct()
      .zipWithIndex();

  // Count how often each (doc, word) pair occurs.
  JavaPairRDD<Tuple2<String, String>, Double> docWordCountRDD = uniqueDocRDD
      .flatMapToPair(docWords -> {
        List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<>();
        for (String word : docWords._2) {
          Tuple2<String, String> docWord = new Tuple2<String, String>(docWords._1, word);
          pairs.add(new Tuple2<Tuple2<String, String>, Double>(docWord, 1.0));
        }
        return pairs.iterator();
      })
      .reduceByKey((first, second) -> first + second);

  // Re-key by word so the word IDs can be joined in: word -> (doc, count).
  JavaPairRDD<String, Tuple2<String, Double>> wordDocCountRDD = docWordCountRDD.mapToPair(
      entry -> new Tuple2<String, Tuple2<String, Double>>(entry._1._2,
          new Tuple2<String, Double>(entry._1._1, entry._2)));

  JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> joinedRDD =
      wordDocCountRDD.leftOuterJoin(wordIDRDD);

  int wordsize = (int) wordIDRDD.count();

  JavaPairRDD<String, Vector> docVectorRDD = joinedRDD
      .mapToPair(entry -> {
        // A word without an ID falls back to column 0, as before.
        Optional<Long> maybeId = entry._2._2;
        long wordId = 0L;
        if (maybeId.isPresent()) {
          wordId = maybeId.get();
        }
        List<Tuple2<Integer, Double>> cells = new ArrayList<>();
        // Keep the count as a double; the previous code truncated it through intValue().
        cells.add(new Tuple2<Integer, Double>((int) wordId, entry._2._1._2));
        return new Tuple2<String, List<Tuple2<Integer, Double>>>(entry._2._1._1, cells);
      })
      .reduceByKey((left, right) -> {
        left.addAll(right);
        return left;
      })
      // The cells arrive in arbitrary order. The (size, int[], double[]) overload expects
      // strictly increasing indices, so use the overload that accepts unordered pairs and
      // sorts them itself.
      .mapToPair(docCells -> new Tuple2<String, Vector>(docCells._1,
          Vectors.sparse(wordsize, docCells._2)));

  RowMatrix docwordMatrix = new RowMatrix(docVectorRDD.values().rdd());

  LabeledRowMatrix labeledRowMatrix = new LabeledRowMatrix();
  labeledRowMatrix.wordDocMatrix = docwordMatrix;
  // NOTE(review): `words` receives the row (document) keys and `docs` receives the distinct
  // words. The field names look swapped relative to their content; confirm against callers.
  labeledRowMatrix.words = docVectorRDD.keys().collect();
  labeledRowMatrix.docs = wordIDRDD.keys().collect();
  return labeledRowMatrix;
}
From source file:esiptestbed.mudrod.utils.SimilarityUtil.java
License:Apache License
/** * MatrixtoTriples:Convert term similarity matrix to linkage triple list. * * * @param keys/*from w w w . j a v a2s . c o m*/ * each key is a term * @param simMatirx * term similarity matrix, in which each row and column is a term and * the cell value is the similarity between the two terms * @return linkage triple list */ public static List<LinkageTriple> MatrixtoTriples(JavaRDD<String> keys, CoordinateMatrix simMatirx) { if (simMatirx.numCols() != keys.count()) { return null; } // index words JavaPairRDD<Long, String> keyIdRDD = JavaPairRDD .fromJavaRDD(keys.zipWithIndex().map(new Function<Tuple2<String, Long>, Tuple2<Long, String>>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<Long, String> call(Tuple2<String, Long> doc_id) { return doc_id.swap(); } })); JavaPairRDD<Long, LinkageTriple> entries_rowRDD = simMatirx.entries().toJavaRDD() .mapToPair(new PairFunction<MatrixEntry, Long, LinkageTriple>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<Long, LinkageTriple> call(MatrixEntry t) throws Exception { LinkageTriple triple = new LinkageTriple(); triple.keyAId = t.i(); triple.keyBId = t.j(); triple.weight = t.value(); return new Tuple2<Long, LinkageTriple>(triple.keyAId, triple); } }); JavaPairRDD<Long, LinkageTriple> entries_colRDD = entries_rowRDD.leftOuterJoin(keyIdRDD).values() .mapToPair(new PairFunction<Tuple2<LinkageTriple, Optional<String>>, Long, LinkageTriple>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<Long, LinkageTriple> call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception { LinkageTriple triple = t._1; Optional<String> stra = t._2; if (stra.isPresent()) { triple.keyA = stra.get(); } return new Tuple2<Long, LinkageTriple>(triple.keyBId, triple); } }); JavaRDD<LinkageTriple> tripleRDD = entries_colRDD.leftOuterJoin(keyIdRDD).values() .map(new Function<Tuple2<LinkageTriple, Optional<String>>, LinkageTriple>() { /** * */ 
private static final long serialVersionUID = 1L; @Override public LinkageTriple call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception { LinkageTriple triple = t._1; Optional<String> strb = t._2; if (strb.isPresent()) { triple.keyB = strb.get(); } return triple; } }); List<LinkageTriple> triples = tripleRDD.collect(); return triples; }
From source file:esiptestbed.mudrod.weblog.structure.SessionExtractor.java
License:Apache License
public JavaPairRDD<String, Double> bulidSessionItermRDD(JavaRDD<ClickStream> clickstreamRDD, int filterOpt) { JavaPairRDD<String, String> sessionItemRDD = clickstreamRDD .mapToPair(new PairFunction<ClickStream, String, String>() { /**/*from w ww . ja v a 2 s. co m*/ * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, String> call(ClickStream click) throws Exception { String sessionID = click.getSessionID(); return new Tuple2<String, String>(sessionID, click.getViewDataset()); } }).distinct(); // remove some sessions JavaPairRDD<String, Double> sessionItemNumRDD = sessionItemRDD.keys() .mapToPair(new PairFunction<String, String, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, Double> call(String item) throws Exception { return new Tuple2<String, Double>(item, 1.0); } }).reduceByKey(new Function2<Double, Double, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Double call(Double v1, Double v2) throws Exception { return v1 + v2; } }).filter(new Function<Tuple2<String, Double>, Boolean>() { /** * */ private static final long serialVersionUID = 1L; @Override public Boolean call(Tuple2<String, Double> arg0) throws Exception { Boolean b = true; if (arg0._2 < 2) { b = false; } return b; } }); JavaPairRDD<String, Double> filteredSessionItemRDD = sessionItemNumRDD.leftOuterJoin(sessionItemRDD) .mapToPair(new PairFunction<Tuple2<String, Tuple2<Double, Optional<String>>>, String, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, Double> call(Tuple2<String, Tuple2<Double, Optional<String>>> arg0) throws Exception { Tuple2<Double, Optional<String>> test = arg0._2; Optional<String> optStr = test._2; String item = ""; if (optStr.isPresent()) { item = optStr.get(); } return new Tuple2<String, Double>(arg0._1 + "," + item, 1.0); } }); return filteredSessionItemRDD; }
From source file:gov.nasa.jpl.mudrod.utils.MatrixUtil.java
License:Apache License
public static LabeledRowMatrix createDocWordMatrix(JavaPairRDD<String, List<String>> uniqueDocRDD, JavaSparkContext sc) {//from ww w . ja v a2s . co m // Index word with unique IDs JavaPairRDD<String, Long> wordIDRDD = uniqueDocRDD.values() .flatMap(new FlatMapFunction<List<String>, String>() { /** * */ private static final long serialVersionUID = 1L; @Override public Iterator<String> call(List<String> arg0) throws Exception { return arg0.iterator(); } }).distinct().zipWithIndex(); // JavaPairRDD<Tuple2<String, String>, Double> docwordNumRDD = uniqueDocRDD.flatMapToPair( new PairFlatMapFunction<Tuple2<String, List<String>>, Tuple2<String, String>, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Iterator<Tuple2<Tuple2<String, String>, Double>> call( Tuple2<String, List<String>> docwords) throws Exception { List<Tuple2<Tuple2<String, String>, Double>> pairs = new ArrayList<>(); List<String> words = docwords._2; int n = words.size(); for (int i = 0; i < n; i++) { Tuple2<String, String> worddoc = new Tuple2<>(docwords._1, words.get(i)); pairs.add(new Tuple2<Tuple2<String, String>, Double>(worddoc, 1.0)); } return pairs.iterator(); } }).reduceByKey(new Function2<Double, Double, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Double call(Double first, Double second) throws Exception { return first + second; } }); // JavaPairRDD<String, Tuple2<String, Double>> wordDocnumRDD = docwordNumRDD.mapToPair( new PairFunction<Tuple2<Tuple2<String, String>, Double>, String, Tuple2<String, Double>>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, Tuple2<String, Double>> call(Tuple2<Tuple2<String, String>, Double> arg0) throws Exception { Tuple2<String, Double> wordmums = new Tuple2<>(arg0._1._1, arg0._2); return new Tuple2<>(arg0._1._2, wordmums); } }); // JavaPairRDD<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> testRDD = wordDocnumRDD 
.leftOuterJoin(wordIDRDD); int wordsize = (int) wordIDRDD.count(); JavaPairRDD<String, Vector> docVectorRDD = testRDD.mapToPair( new PairFunction<Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>>, String, Tuple2<List<Long>, List<Double>>>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, Tuple2<List<Long>, List<Double>>> call( Tuple2<String, Tuple2<Tuple2<String, Double>, Optional<Long>>> arg0) throws Exception { Optional<Long> oid = arg0._2._2; Long wordId = (long) 0; if (oid.isPresent()) { wordId = oid.get(); } List<Long> word = new ArrayList<>(); word.add(wordId); List<Double> count = new ArrayList<>(); count.add(arg0._2._1._2); Tuple2<List<Long>, List<Double>> wordcount = new Tuple2<>(word, count); return new Tuple2<>(arg0._2._1._1, wordcount); } }).reduceByKey( new Function2<Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>, Tuple2<List<Long>, List<Double>>>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<List<Long>, List<Double>> call(Tuple2<List<Long>, List<Double>> arg0, Tuple2<List<Long>, List<Double>> arg1) throws Exception { arg0._1.addAll(arg1._1); arg0._2.addAll(arg1._2); return new Tuple2<>(arg0._1, arg0._2); } }) .mapToPair(new PairFunction<Tuple2<String, Tuple2<List<Long>, List<Double>>>, String, Vector>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, Vector> call(Tuple2<String, Tuple2<List<Long>, List<Double>>> arg0) throws Exception { int docsize = arg0._2._1.size(); int[] intArray = new int[docsize]; double[] doubleArray = new double[docsize]; for (int i = 0; i < docsize; i++) { intArray[i] = arg0._2._1.get(i).intValue(); doubleArray[i] = arg0._2._2.get(i).intValue(); } Vector sv = Vectors.sparse(wordsize, intArray, doubleArray); return new Tuple2<>(arg0._1, sv); } }); RowMatrix docwordMatrix = new RowMatrix(docVectorRDD.values().rdd()); LabeledRowMatrix labeledRowMatrix = new 
LabeledRowMatrix(); labeledRowMatrix.rowMatrix = docwordMatrix; labeledRowMatrix.rowkeys = docVectorRDD.keys().collect(); labeledRowMatrix.colkeys = wordIDRDD.keys().collect(); return labeledRowMatrix; }
From source file:gov.nasa.jpl.mudrod.utils.SimilarityUtil.java
License:Apache License
/** * MatrixtoTriples:Convert term similarity matrix to linkage triple list. * * @param keys each key is a term// w w w .ja va 2s . co m * @param simMatirx term similarity matrix, in which each row and column is a term and * the cell value is the similarity between the two terms * @return linkage triple list */ public static List<LinkageTriple> matrixToTriples(JavaRDD<String> keys, CoordinateMatrix simMatirx) { if (simMatirx.numCols() != keys.count()) { return null; } // index words JavaPairRDD<Long, String> keyIdRDD = JavaPairRDD .fromJavaRDD(keys.zipWithIndex().map(new Function<Tuple2<String, Long>, Tuple2<Long, String>>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<Long, String> call(Tuple2<String, Long> docId) { return docId.swap(); } })); JavaPairRDD<Long, LinkageTriple> entriesRowRDD = simMatirx.entries().toJavaRDD() .mapToPair(new PairFunction<MatrixEntry, Long, LinkageTriple>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<Long, LinkageTriple> call(MatrixEntry t) throws Exception { LinkageTriple triple = new LinkageTriple(); triple.keyAId = t.i(); triple.keyBId = t.j(); triple.weight = t.value(); return new Tuple2<>(triple.keyAId, triple); } }); JavaPairRDD<Long, LinkageTriple> entriesColRDD = entriesRowRDD.leftOuterJoin(keyIdRDD).values() .mapToPair(new PairFunction<Tuple2<LinkageTriple, Optional<String>>, Long, LinkageTriple>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<Long, LinkageTriple> call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception { LinkageTriple triple = t._1; Optional<String> stra = t._2; if (stra.isPresent()) { triple.keyA = stra.get(); } return new Tuple2<>(triple.keyBId, triple); } }); JavaRDD<LinkageTriple> tripleRDD = entriesColRDD.leftOuterJoin(keyIdRDD).values() .map(new Function<Tuple2<LinkageTriple, Optional<String>>, LinkageTriple>() { /** * */ private static final long serialVersionUID = 1L; @Override 
public LinkageTriple call(Tuple2<LinkageTriple, Optional<String>> t) throws Exception { LinkageTriple triple = t._1; Optional<String> strb = t._2; if (strb.isPresent()) { triple.keyB = strb.get(); } return triple; } }); return tripleRDD.collect(); }
From source file:gov.nasa.jpl.mudrod.weblog.structure.SessionExtractor.java
License:Apache License
public JavaPairRDD<String, Double> bulidSessionItermRDD(JavaRDD<ClickStream> clickstreamRDD) { JavaPairRDD<String, String> sessionItemRDD = clickstreamRDD .mapToPair(new PairFunction<ClickStream, String, String>() { /**//ww w . j a v a 2s . c o m * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, String> call(ClickStream click) throws Exception { String sessionID = click.getSessionID(); return new Tuple2<>(sessionID, click.getViewDataset()); } }).distinct(); // remove some sessions JavaPairRDD<String, Double> sessionItemNumRDD = sessionItemRDD.keys() .mapToPair(new PairFunction<String, String, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, Double> call(String item) throws Exception { return new Tuple2<>(item, 1.0); } }).reduceByKey(new Function2<Double, Double, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Double call(Double v1, Double v2) throws Exception { return v1 + v2; } }).filter(new Function<Tuple2<String, Double>, Boolean>() { /** * */ private static final long serialVersionUID = 1L; @Override public Boolean call(Tuple2<String, Double> arg0) throws Exception { Boolean b = true; if (arg0._2 < 2) { b = false; } return b; } }); return sessionItemNumRDD.leftOuterJoin(sessionItemRDD) .mapToPair(new PairFunction<Tuple2<String, Tuple2<Double, Optional<String>>>, String, Double>() { /** * */ private static final long serialVersionUID = 1L; @Override public Tuple2<String, Double> call(Tuple2<String, Tuple2<Double, Optional<String>>> arg0) throws Exception { Tuple2<Double, Optional<String>> test = arg0._2; Optional<String> optStr = test._2; String item = ""; if (optStr.isPresent()) { item = optStr.get(); } return new Tuple2<>(arg0._1 + "," + item, 1.0); } }); }