List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction
PairFlatMapFunction
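PairFlatMapFunction is the flatMap-to-pairs counterpart of PairFunction: it is passed to JavaRDD.flatMapToPair (or mapPartitionsToPair, or a DStream's flatMapToPair, as in the examples below) and emits zero or more Tuple2 key-value pairs per input element. Note that the signature changed between releases: in the Spark 1.x Java API, call() returns an Iterable<Tuple2<K, V>>, while from Spark 2.0 onward it returns an Iterator<Tuple2<K, V>>; both forms appear in the examples on this page. Before the full examples, here is a minimal self-contained sketch written against the Spark 2.x signature; the input data and app name are illustrative only and not taken from the examples below.

// Minimal sketch of PairFlatMapFunction with flatMapToPair (Spark 2.x Java API).
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapFunctionSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("PairFlatMapFunctionSketch").setMaster("local[2]"));
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "b c"));

        // Emit one (word, 1) pair per token of each input line.
        JavaPairRDD<String, Integer> pairs = lines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    @Override
                    public Iterator<Tuple2<String, Integer>> call(String line) {
                        List<Tuple2<String, Integer>> out = new ArrayList<>();
                        for (String word : line.split(" ")) {
                            out.add(new Tuple2<>(word, 1));
                        }
                        return out.iterator(); // Spark 1.x expects an Iterable here instead
                    }
                });

        // Sum the counts per word and print the result.
        System.out.println(pairs.reduceByKey((a, b) -> a + b).collectAsMap());
        sc.stop();
    }
}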
From source file: info.debatty.spark.knngraphs.builder.DoubleApproximate.java
License: Open Source License

@Override
protected JavaPairRDD<Integer, Node<SparseDoubleVector>> _binNodes(JavaRDD<Node<SparseDoubleVector>> nodes)
        throws Exception {
    final long count = nodes.count();
    lsh = new info.debatty.java.lsh.LSHSuperBit(stages, buckets, this.dim);
    return nodes.flatMapToPair(
            new PairFlatMapFunction<Node<SparseDoubleVector>, Integer, Node<SparseDoubleVector>>() {
                public Iterable<Tuple2<Integer, Node<SparseDoubleVector>>> call(Node<SparseDoubleVector> n)
                        throws Exception {
                    ArrayList<Tuple2<Integer, Node<SparseDoubleVector>>> r =
                            new ArrayList<Tuple2<Integer, Node<SparseDoubleVector>>>();
                    int[] hash = lsh.hash(n.value);

                    // Downsample vectors using DIMSUM
                    n.value.sampleDIMSUM(0.5, (int) count, dim);

                    for (int v : hash) {
                        r.add(new Tuple2<Integer, Node<SparseDoubleVector>>(v, n));
                    }
                    return r;
                }
            });
}
From source file: info.debatty.spark.knngraphs.builder.NNCTPH.java
License: Open Source License

@Override
protected JavaPairRDD<Integer, Node<String>> _binNodes(JavaRDD<Node<String>> nodes) {
    return nodes.flatMapToPair(new PairFlatMapFunction<Node<String>, Integer, Node<String>>() {
        public Iterable<Tuple2<Integer, Node<String>>> call(Node<String> n) throws Exception {
            ESSum ess = new ESSum(stages, buckets, 1);
            ArrayList<Tuple2<Integer, Node<String>>> r = new ArrayList<Tuple2<Integer, Node<String>>>();
            int[] hash = ess.HashString(n.value);
            for (int v : hash) {
                r.add(new Tuple2<Integer, Node<String>>(v, n));
            }
            return r;
        }
    });
}
From source file: it.unitn.spark.examples.bigdata2015.JavaPageRank.java
License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL neighbor URL
    //     URL neighbor URL
    //     URL neighbor URL
    //     ...
    JavaRDD<String> lines = sc.textFile(args[0], 1);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and
    // initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    public Iterator<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results.iterator();
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dump them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    sc.stop();
    sc.close();
}
From source file: org.apache.blur.spark.BlurMRBulkLoadSparkProcessor.java
License: Apache License

@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
    return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {
            // Blur Table Details
            Iface client = getBlurClient();
            TableDescriptor tableDescriptor = client.describe(getBlurTableName());
            Configuration conf = new Configuration();

            // Blur specific Configuration
            conf.setClass(MAPREDUCE_PARTITIONER_CLASS, BlurPartitioner.class, Partitioner.class);
            conf.set(MAPRED_OUTPUT_COMMITTER_CLASS, BlurOutputCommitter.class.getName());

            // Partition RDD to match Blur Table Shard Count. Used Custom
            // Partitioner to channel correct BlurMutate to correct Shard.
            BlurSparkPartitioner blurSparkPartitioner = new BlurSparkPartitioner(
                    tableDescriptor.getShardCount());

            JavaPairRDD<Text, BlurMutate> flatMapToPair = rdd
                    .flatMapToPair(new PairFlatMapFunction<Tuple2<String, RowMutation>, Text, BlurMutate>() {
                        @Override
                        public Iterable<Tuple2<Text, BlurMutate>> call(Tuple2<String, RowMutation> tuple2)
                                throws Exception {
                            RowMutation rowMutation = tuple2._2;
                            final List<BlurMutate> result = new ArrayList<BlurMutate>();
                            List<RecordMutation> recordMutations = rowMutation.getRecordMutations();
                            String rowId = rowMutation.getRowId();
                            for (RecordMutation recordMutation : recordMutations) {
                                Record record = recordMutation.getRecord();
                                String family = record.getFamily();
                                String recordId = record.getRecordId();
                                List<BlurColumn> columns = toColumns(record.getColumns());

                                BlurRecord blurRecord = new BlurRecord();
                                blurRecord.setRowId(rowId);
                                blurRecord.setFamily(family);
                                blurRecord.setRecordId(recordId);
                                blurRecord.setColumns(columns);
                                result.add(new BlurMutate(MUTATE_TYPE.REPLACE, blurRecord));
                            }
                            return new Iterable<Tuple2<Text, BlurMutate>>() {
                                @Override
                                public Iterator<Tuple2<Text, BlurMutate>> iterator() {
                                    final Iterator<BlurMutate> iterator = result.iterator();
                                    return new Iterator<Tuple2<Text, BlurMutate>>() {
                                        @Override
                                        public boolean hasNext() {
                                            return iterator.hasNext();
                                        }

                                        @Override
                                        public Tuple2<Text, BlurMutate> next() {
                                            BlurMutate blurMutate = iterator.next();
                                            return new Tuple2<Text, BlurMutate>(
                                                    new Text(blurMutate.getRecord().getRowId()), blurMutate);
                                        }

                                        @Override
                                        public void remove() {
                                        }
                                    };
                                }
                            };
                        }

                        private List<BlurColumn> toColumns(List<Column> columns) {
                            List<BlurColumn> cols = new ArrayList<BlurColumn>();
                            for (Column column : columns) {
                                cols.add(new BlurColumn(column.getName(), column.getValue()));
                            }
                            return cols;
                        }
                    });

            final JavaPairRDD<Text, BlurMutate> pRdd = flatMapToPair.partitionBy(blurSparkPartitioner)
                    .persist(getStorageLevel());

            Job job = new Job(conf);
            BlurOutputFormat.setupJob(job, tableDescriptor);
            Path path = new Path(getOutputPath());
            FileSystem fileSystem = path.getFileSystem(conf);
            Path qualified = fileSystem.makeQualified(path);
            BlurOutputFormat.setOutputPath(job, qualified);
            setupBlurHadoopConfig(job.getConfiguration());

            // Write the RDD to Blur Table
            if (pRdd.count() > 0) {
                pRdd.saveAsNewAPIHadoopFile(tableDescriptor.getTableUri(), Text.class, BlurMutate.class,
                        BlurOutputFormat.class, job.getConfiguration());
                client.loadData(getBlurTableName(), qualified.toString());
            }
            return null;
        }
    };
}
From source file: org.apache.kylin.engine.spark.SparkCubing.java
License: Apache License

/** return hfile location */
private String build(JavaRDD<List<String>> javaRDD, final String cubeName, final String segmentId,
        final byte[][] splitKeys) throws Exception {
    CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
    CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
    List<TblColRef> baseCuboidColumn = Cuboid.findById(cubeDesc, Cuboid.getBaseCuboidId(cubeDesc)).getColumns();
    final Map<TblColRef, Integer> columnLengthMap = Maps.newHashMap();
    final CubeDimEncMap dimEncMap = cubeSegment.getDimensionEncodingMap();
    for (TblColRef tblColRef : baseCuboidColumn) {
        columnLengthMap.put(tblColRef, dimEncMap.get(tblColRef).getLengthOfEncoding());
    }

    final Map<TblColRef, Dictionary<String>> dictionaryMap = Maps.newHashMap();
    for (DimensionDesc dim : cubeDesc.getDimensions()) {
        // dictionary
        for (TblColRef col : dim.getColumnRefs()) {
            if (cubeDesc.getRowkey().isUseDictionary(col)) {
                Dictionary<String> dict = cubeSegment.getDictionary(col);
                if (dict == null) {
                    System.err.println("Dictionary for " + col + " was not found.");
                    continue;
                }
                dictionaryMap.put(col, dict);
                System.out.println("col:" + col + " dictionary size:" + dict.getSize());
            }
        }
    }

    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        FunctionDesc func = measureDesc.getFunction();
        List<TblColRef> colRefs = func.getMeasureType().getColumnsNeedDictionary(func);
        for (TblColRef col : colRefs) {
            dictionaryMap.put(col, cubeSegment.getDictionary(col));
        }
    }

    final JavaPairRDD<byte[], byte[]> javaPairRDD = javaRDD.glom()
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<List<List<String>>>, byte[], byte[]>() {

                @Override
                public Iterable<Tuple2<byte[], byte[]>> call(Iterator<List<List<String>>> listIterator)
                        throws Exception {
                    long t = System.currentTimeMillis();
                    prepare();
                    final CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv())
                            .getCube(cubeName);
                    LinkedBlockingQueue<List<String>> blockingQueue = new LinkedBlockingQueue();
                    System.out.println("load properties finished");
                    IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(cubeSegment);
                    AbstractInMemCubeBuilder inMemCubeBuilder = new DoggedCubeBuilder(
                            cubeInstance.getDescriptor(), flatDesc, dictionaryMap);
                    final SparkCuboidWriter sparkCuboidWriter = new BufferedCuboidWriter(
                            new DefaultTupleConverter(cubeInstance.getSegmentById(segmentId), columnLengthMap));
                    Executors.newCachedThreadPool()
                            .submit(inMemCubeBuilder.buildAsRunnable(blockingQueue, sparkCuboidWriter));
                    try {
                        while (listIterator.hasNext()) {
                            for (List<String> row : listIterator.next()) {
                                blockingQueue.put(row);
                            }
                        }
                        blockingQueue.put(Collections.<String>emptyList());
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                    System.out.println("build partition cost: " + (System.currentTimeMillis() - t) + "ms");
                    return sparkCuboidWriter.getResult();
                }
            });

    KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
    Configuration conf = getConfigurationForHFile(cubeSegment.getStorageLocationIdentifier());
    Path path = new Path(kylinConfig.getHdfsWorkingDirectory(), "hfile_" + UUID.randomUUID().toString());
    Preconditions.checkArgument(!FileSystem.get(conf).exists(path));
    String url = conf.get("fs.defaultFS") + path.toString();
    System.out.println("use " + url + " as hfile");

    List<MeasureDesc> measuresDescs = cubeDesc.getMeasures();
    final int measureSize = measuresDescs.size();
    final String[] dataTypes = new String[measureSize];
    for (int i = 0; i < dataTypes.length; i++) {
        dataTypes[i] = measuresDescs.get(i).getFunction().getReturnType();
    }
    final MeasureAggregators aggs = new MeasureAggregators(measuresDescs);
    writeToHFile2(javaPairRDD, dataTypes, measureSize, aggs, splitKeys, conf, url);
    return url;
}
From source file: org.apache.kylin.storage.hbase.steps.SparkCubeHFile.java
License: Apache License

@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
    final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
    final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            KeyValueCreator.class, KeyValue.class, RowKeyWritable.class };

    SparkConf conf = new SparkConf().setAppName("Converting HFile for:" + cubeName + " segment " + segmentId);
    // serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);

        final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
        if (!fs.exists(partitionFilePath)) {
            throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
        }

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = cubeInstance.getDescriptor();
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
        final MeasureCodec inputCodec = new MeasureCodec(cubeDesc.getMeasures());

        final List<KeyValueCreator> keyValueCreators = Lists.newArrayList();
        for (HBaseColumnFamilyDesc cfDesc : cubeDesc.getHbaseMapping().getColumnFamily()) {
            for (HBaseColumnDesc colDesc : cfDesc.getColumns()) {
                keyValueCreators.add(new KeyValueCreator(cubeDesc, colDesc));
            }
        }

        final int cfNum = keyValueCreators.size();
        final boolean quickPath = (keyValueCreators.size() == 1) && keyValueCreators.get(0).isFullCopy;

        logger.info("Input path: {}", inputPath);
        logger.info("Output path: {}", outputPath);

        // read partition split keys
        List<RowKeyWritable> keys = new ArrayList<>();
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath,
                sc.hadoopConfiguration())) {
            RowKeyWritable key = new RowKeyWritable();
            Writable value = NullWritable.get();
            while (reader.next(key, value)) {
                keys.add(key);
                logger.info(" ------- split key: {}", key);
                key = new RowKeyWritable(); // important, new an object!
            }
        }

        logger.info("There are {} split keys, totally {} hfiles", keys.size(), (keys.size() + 1));

        // HBase conf
        logger.info("Loading HBase configuration from:{}", hbaseConfFile);
        final Path hbaseConfFilePath = new Path(hbaseConfFile);
        final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());

        try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
            Configuration hbaseJobConf = new Configuration();
            hbaseJobConf.addResource(confInput);
            hbaseJobConf.set("spark.hadoop.dfs.replication", "3"); // HFile, replication=3
            Job job = Job.getInstance(hbaseJobConf, cubeSegment.getStorageLocationIdentifier());

            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            // inputPath has the same FileSystem as hbaseClusterFs when in HBase standalone mode
            JavaPairRDD<Text, Text> inputRDDs = SparkUtil.parseInputPath(inputPath, hbaseClusterFs, sc,
                    Text.class, Text.class);
            final JavaPairRDD<RowKeyWritable, KeyValue> hfilerdd;
            if (quickPath) {
                hfilerdd = inputRDDs
                        .mapToPair(new PairFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Tuple2<RowKeyWritable, KeyValue> call(Tuple2<Text, Text> textTextTuple2)
                                    throws Exception {
                                KeyValue outputValue = keyValueCreators.get(0).create(textTextTuple2._1,
                                        textTextTuple2._2.getBytes(), 0, textTextTuple2._2.getLength());
                                return new Tuple2<>(
                                        new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                        outputValue);
                            }
                        });
            } else {
                hfilerdd = inputRDDs
                        .flatMapToPair(new PairFlatMapFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Iterator<Tuple2<RowKeyWritable, KeyValue>> call(
                                    Tuple2<Text, Text> textTextTuple2) throws Exception {
                                List<Tuple2<RowKeyWritable, KeyValue>> result = Lists
                                        .newArrayListWithExpectedSize(cfNum);
                                Object[] inputMeasures = new Object[cubeDesc.getMeasures().size()];
                                inputCodec.decode(ByteBuffer.wrap(textTextTuple2._2.getBytes(), 0,
                                        textTextTuple2._2.getLength()), inputMeasures);

                                for (int i = 0; i < cfNum; i++) {
                                    KeyValue outputValue = keyValueCreators.get(i).create(textTextTuple2._1,
                                            inputMeasures);
                                    result.add(new Tuple2<>(
                                            new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                            outputValue));
                                }
                                return result.iterator();
                            }
                        });
            }

            hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
                    RowKeyWritable.RowKeyComparator.INSTANCE)
                    .mapToPair(
                            new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
                                @Override
                                public Tuple2<ImmutableBytesWritable, KeyValue> call(
                                        Tuple2<RowKeyWritable, KeyValue> rowKeyWritableKeyValueTuple2)
                                        throws Exception {
                                    return new Tuple2<>(
                                            new ImmutableBytesWritable(rowKeyWritableKeyValueTuple2._2.getKey()),
                                            rowKeyWritableKeyValueTuple2._2);
                                }
                            })
                    .saveAsNewAPIHadoopDataset(job.getConfiguration());
        }

        logger.info("HDFS: Number of bytes written={}", jobListener.metrics.getBytesWritten());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.HDFS_BYTES_WRITTEN,
                String.valueOf(jobListener.metrics.getBytesWritten()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
    }
}
From source file: org.biocaddie.citationanalysis.metrics.JavaPageRank.java
License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    JavaSparkContext ctx = SparkUtils.getJavaSparkContext("JavaPageRank");

    // Loads in input file. It should be in format of:
    //     URL neighbor URL
    //     URL neighbor URL
    //     URL neighbor URL
    //     ...
    // JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> lines = ctx.textFile(args[0]);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dump them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}
From source file: org.biocaddie.citationanalysis.metrics.JavaPageRankInt.java
License: Apache License

public static void main(String[] args) throws Exception {
    // The code below reads three arguments: the link file, the id mapping file, and the iteration count.
    if (args.length < 3) {
        System.err.println("Usage: JavaPageRank <file> <id_file> <number_of_iterations>");
        System.exit(1);
    }

    double alpha = 0.5;

    JavaSparkContext ctx = SparkUtils.getJavaSparkContext("JavaPageRank");

    // Loads in input file. It should be in format of:
    //     URL neighbor URL
    //     URL neighbor URL
    //     URL neighbor URL
    //     ...
    // JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> lines = ctx.textFile(args[0]);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<Integer, Iterable<Integer>> links = lines
            .mapToPair(new PairFunction<String, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(String s) {
                    String[] parts = SPACES.split(s);
                    return new Tuple2<Integer, Integer>(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
                }
            }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<Integer, Double> ranks = links.mapValues(new Function<Iterable<Integer>, Double>() {
        @Override
        public Double call(Iterable<Integer> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[2]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<Integer, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<Integer>, Double>, Integer, Double>() {
                    @Override
                    public Iterable<Tuple2<Integer, Double>> call(Tuple2<Iterable<Integer>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<Integer, Double>> results = new ArrayList<Tuple2<Integer, Double>>();
                        for (Integer n : s._1) {
                            results.add(new Tuple2<Integer, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return alpha + sum * (1 - alpha);
                // return 0.15 + sum * 0.85;
            }
        });
    }

    JavaRDD<String> idLines = ctx.textFile(args[1]);
    JavaPairRDD<Integer, Integer> pmIds = idLines.mapToPair(new PairFunction<String, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(String s) {
            String[] parts = s.split(",");
            return new Tuple2<Integer, Integer>(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
        }
    });

    ranks = ranks.filter(t -> t._2 > 0.8);

    JavaPairRDD<Integer, Tuple2<Double, Integer>> join = ranks.join(pmIds);
    List<Tuple2<Integer, Tuple2<Double, Integer>>> collect = join.collect();
    for (Tuple2<Integer, Tuple2<Double, Integer>> t : collect) {
        System.out.println(t._1 + "," + t._2._2 + "," + t._2._1);
    }

    // Collects all URL ranks and dump them to console.
    // List<Tuple2<Integer, Double>> output = ranks.collect();
    // for (Tuple2<?, ?> tuple : output) {
    //     System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    // }

    ctx.stop();
}
From source file: org.kaaproject.examples.spark.KaaSparkExample.java
License: Apache License

@SuppressWarnings("serial")
public static void main(String[] args) throws Exception {
    // Initializing Spark streaming context
    JavaStreamingContext ssc = new JavaStreamingContext(new JavaSparkContext(new SparkConf()), BATCH_DURATION);

    // Creating Flume stream to consume the data
    LOG.info("Binding flume stream to {}:{}", args[0], args[1]);
    JavaReceiverInputDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, args[0],
            Integer.parseInt(args[1]));

    // Decode and map incoming events to <ZoneID, ZoneStats> pairs
    JavaPairDStream<Integer, ZoneStats> zoneVoltageDstream = flumeStream
            .flatMapToPair(new PairFlatMapFunction<SparkFlumeEvent, Integer, ZoneStats>() {
                @Override
                public Iterable<Tuple2<Integer, ZoneStats>> call(SparkFlumeEvent sparkFlumeEvent)
                        throws Exception {
                    List<Tuple2<Integer, ZoneStats>> results = new ArrayList<Tuple2<Integer, ZoneStats>>();
                    // Iterating through each event
                    for (PowerReport record : reader.decodeRecords(sparkFlumeEvent.event().getBody())) {
                        LOG.info("Parsed record: {}", record);
                        // Iterating through per panel samples
                        for (PowerSample sample : record.getSamples()) {
                            results.add(new Tuple2<Integer, ZoneStats>(sample.getZoneId(),
                                    new ZoneStats(1, sample.getPower())));
                        }
                    }
                    LOG.info("Event parsed.");
                    return results;
                }
            });

    // Apply simple reduce function to all <ZoneID, ZoneStats> pairs in
    // order to calculate average and total power produced in each zone.
    zoneVoltageDstream
            .reduceByKey(new Function2<KaaSparkExample.ZoneStats, KaaSparkExample.ZoneStats, KaaSparkExample.ZoneStats>() {
                // Simple reduce function that calculates total panel count and
                // total power produced in scope of each zone.
                @Override
                public ZoneStats call(ZoneStats v1, ZoneStats v2) throws Exception {
                    return new ZoneStats(v1.panelCount + v2.panelCount, v1.powerSum + v2.powerSum);
                }
            })
            .transformToPair(new Function<JavaPairRDD<Integer, ZoneStats>, JavaPairRDD<Integer, ZoneStats>>() {
                @Override
                public JavaPairRDD<Integer, ZoneStats> call(JavaPairRDD<Integer, ZoneStats> v1)
                        throws Exception {
                    return v1.sortByKey();
                }
            })
            // Map results to string for pretty output
            .map(new Function<Tuple2<Integer, ZoneStats>, String>() {
                @Override
                public String call(Tuple2<Integer, ZoneStats> tuple) throws Exception {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Zone ").append(tuple._1()).append(": ");
                    sb.append("Total power ").append(tuple._2().getTotalPower()).append(" collected from ")
                            .append(tuple._2().getPanelCount()).append(" panels. ");
                    sb.append("Average power produced by each panel is ").append(tuple._2().getAvgPower());
                    return sb.toString();
                }
            }).print();

    // Start streaming application
    ssc.start();
    // Block until terminated
    ssc.awaitTermination();
}
From source file: org.kitesdk.examples.spark.CorrelateEventsTask.java
License: Apache License

public void run() throws IOException {
    Configuration conf = new Configuration();
    DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
    DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);

    // Create our Spark configuration and get a Java context
    SparkConf sparkConf = new SparkConf().setAppName("Correlate Events")
            // Configure the use of Kryo serialization including our Avro registrator
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "org.kitesdk.examples.spark.AvroKyroRegistrator");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    JavaPairRDD<StandardEvent, Void> events = sparkContext.newAPIHadoopRDD(conf, DatasetKeyInputFormat.class,
            StandardEvent.class, Void.class);

    // Map each event to two correlation keys. One with the IP address and the
    // nearest 5 minute interval that happened before the event and one with the
    // IP address and the nearest 5 minute interval that happened after the event
    JavaPairRDD<CorrelationKey, StandardEvent> mappedEvents = events.flatMapToPair(
            new PairFlatMapFunction<Tuple2<StandardEvent, Void>, CorrelationKey, StandardEvent>() {
                @Override
                public Iterable<Tuple2<CorrelationKey, StandardEvent>> call(Tuple2<StandardEvent, Void> t)
                        throws Exception {
                    List<Tuple2<CorrelationKey, StandardEvent>> result =
                            new ArrayList<Tuple2<CorrelationKey, StandardEvent>>(2);
                    StandardEvent event = t._1();
                    long loTimestamp = createLoTimestamp(event.getTimestamp());
                    long hiTimestamp = createHiTimestamp(event.getTimestamp());
                    String ip = event.getIp().toString();

                    result.add(new Tuple2<CorrelationKey, StandardEvent>(new CorrelationKey(loTimestamp, ip),
                            event));
                    result.add(new Tuple2<CorrelationKey, StandardEvent>(new CorrelationKey(hiTimestamp, ip),
                            event));

                    return result;
                }
            });

    // Group the events by their correlation key
    JavaPairRDD<CorrelationKey, Iterable<StandardEvent>> groupedEvents = mappedEvents.groupByKey();

    // Generate potential matches by creating a list of alerts along with the
    // matched list of clicks. If no alerts were found with this correlation
    // key, then output an empty pair
    JavaPairRDD<List<StandardEvent>, List<StandardEvent>> potentialMatches = groupedEvents.mapToPair(
            new PairFunction<Tuple2<CorrelationKey, Iterable<StandardEvent>>, List<StandardEvent>, List<StandardEvent>>() {
                @Override
                public Tuple2<List<StandardEvent>, List<StandardEvent>> call(
                        Tuple2<CorrelationKey, Iterable<StandardEvent>> t) throws Exception {
                    Iterable<StandardEvent> allEvents = t._2();
                    List<StandardEvent> alerts = new ArrayList<StandardEvent>();
                    List<StandardEvent> clicks = new ArrayList<StandardEvent>();

                    for (StandardEvent event : allEvents) {
                        if (event.getEventDetails() != null
                                && event.getEventDetails().containsKey(new Utf8("type"))
                                && "alert".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                            alerts.add(event);
                        } else if (event.getEventDetails() != null
                                && event.getEventDetails().containsKey(new Utf8("type"))
                                && "click".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                            clicks.add(event);
                        }
                    }

                    if (alerts.isEmpty()) {
                        return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, alerts);
                    } else {
                        return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, clicks);
                    }
                }
            });

    // Verify that the matched events are true matches (i.e. the timestamps
    // are really less than or equal to 5 minutes apart)
    JavaPairRDD<CorrelatedEvents, Void> matches = potentialMatches.flatMapToPair(
            new PairFlatMapFunction<Tuple2<List<StandardEvent>, List<StandardEvent>>, CorrelatedEvents, Void>() {
                @Override
                public Iterable<Tuple2<CorrelatedEvents, Void>> call(
                        Tuple2<List<StandardEvent>, List<StandardEvent>> t) throws Exception {
                    List<Tuple2<CorrelatedEvents, Void>> results = new ArrayList<Tuple2<CorrelatedEvents, Void>>();
                    List<StandardEvent> alerts = t._1();
                    List<StandardEvent> clicks = t._2();

                    for (StandardEvent alert : alerts) {
                        List<StandardEvent> correlated = new ArrayList<StandardEvent>();
                        for (StandardEvent click : clicks) {
                            if (Math.abs(alert.getTimestamp() - click.getTimestamp()) <= FIVE_MIN_MILLIS) {
                                correlated.add(click);
                            }
                        }
                        if (!correlated.isEmpty()) {
                            results.add(new Tuple2(CorrelatedEvents.newBuilder().setEvent(alert)
                                    .setCorrelated(correlated).build(), null));
                        }
                    }

                    return results;
                }
            });

    // Write the data to a Kite dataset
    matches.saveAsNewAPIHadoopFile("dummy", CorrelatedEvents.class, Void.class, DatasetKeyOutputFormat.class,
            conf);
}