Example usage for org.apache.spark.api.java.function PairFlatMapFunction PairFlatMapFunction

Introduction

On this page you can find example usages of org.apache.spark.api.java.function.PairFlatMapFunction, instantiated as an anonymous class and passed to Spark transformations such as flatMapToPair and mapPartitionsToPair.

Prototype

public interface PairFlatMapFunction<T, K, V> extends java.io.Serializable {
    Iterator<Tuple2<K, V>> call(T t) throws Exception; // Spark 2.x; in Spark 1.x, call returns Iterable<Tuple2<K, V>>
}
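
The examples below all instantiate this interface as an anonymous inner class and pass it to flatMapToPair or mapPartitionsToPair. As a quick orientation, here is a minimal, self-contained sketch assuming Spark 2.x (where call returns an Iterator); the class name, the sample input, and the word-splitting logic are illustrative only and not taken from the examples below.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapFunctionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("PairFlatMapFunctionSketch").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "c"));

        // Expand each input line into zero or more (word, 1) pairs.
        JavaPairRDD<String, Integer> pairs = lines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    @Override
                    public Iterator<Tuple2<String, Integer>> call(String line) throws Exception {
                        List<Tuple2<String, Integer>> result = new ArrayList<Tuple2<String, Integer>>();
                        for (String word : line.split(" ")) {
                            result.add(new Tuple2<String, Integer>(word, 1));
                        }
                        return result.iterator();
                    }
                });

        System.out.println(pairs.collectAsMap());
        sc.close();
    }
}

Under Spark 1.x the only change is that call declares Iterable<Tuple2<String, Integer>> and returns result directly, which is the style several of the examples below use.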

Usage

From source file:info.debatty.spark.knngraphs.builder.DoubleApproximate.java

License:Open Source License

@Override
protected JavaPairRDD<Integer, Node<SparseDoubleVector>> _binNodes(JavaRDD<Node<SparseDoubleVector>> nodes)
        throws Exception {
    final long count = nodes.count();
    lsh = new info.debatty.java.lsh.LSHSuperBit(stages, buckets, this.dim);

    return nodes.flatMapToPair(
            new PairFlatMapFunction<Node<SparseDoubleVector>, Integer, Node<SparseDoubleVector>>() {

                public Iterable<Tuple2<Integer, Node<SparseDoubleVector>>> call(Node<SparseDoubleVector> n)
                        throws Exception {
                    ArrayList<Tuple2<Integer, Node<SparseDoubleVector>>> r = new ArrayList<Tuple2<Integer, Node<SparseDoubleVector>>>();
                    int[] hash = lsh.hash(n.value);

                    // Downsample vectors using DIMSUM
                    n.value.sampleDIMSUM(0.5, (int) count, dim);

                    for (int v : hash) {
                        r.add(new Tuple2<Integer, Node<SparseDoubleVector>>(v, n));
                    }

                    return r;
                }
            });
}

From source file:info.debatty.spark.knngraphs.builder.NNCTPH.java

License:Open Source License

@Override
protected JavaPairRDD<Integer, Node<String>> _binNodes(JavaRDD<Node<String>> nodes) {
    return nodes.flatMapToPair(new PairFlatMapFunction<Node<String>, Integer, Node<String>>() {

        public Iterable<Tuple2<Integer, Node<String>>> call(Node<String> n) throws Exception {

            ESSum ess = new ESSum(stages, buckets, 1);

            ArrayList<Tuple2<Integer, Node<String>>> r = new ArrayList<Tuple2<Integer, Node<String>>>();
            int[] hash = ess.HashString(n.value);
            for (int v : hash) {
                r.add(new Tuple2<Integer, Node<String>>(v, n));
            }

            return r;
        }
    });
}

From source file:it.unitn.spark.examples.bigdata2015.JavaPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Loads the input file. It should be in the format:
    // URL neighbor URL
    // URL neighbor URL
    // URL neighbor URL
    // ...
    JavaRDD<String> lines = sc.textFile(args[0], 1);

    // Loads all URLs from the input file and initializes their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs that other URLs link to from the input file and
    // initializes their ranks to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks iteratively using the PageRank
    // algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    public Iterator<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results.iterator();
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    sc.stop();
    sc.close();
}
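
The Sum reducer passed to reduceByKey above (and reused in the later PageRank examples) is not shown in these snippets; a minimal sketch consistent with that usage, which simply adds two partial rank contributions, would be:

import org.apache.spark.api.java.function.Function2;

/** Minimal sketch of the Sum reducer assumed by the reduceByKey calls above. */
class Sum implements Function2<Double, Double, Double> {
    @Override
    public Double call(Double a, Double b) {
        return a + b;
    }
}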

From source file:org.apache.blur.spark.BlurMRBulkLoadSparkProcessor.java

License:Apache License

@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
    return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {

            // Blur Table Details
            Iface client = getBlurClient();
            TableDescriptor tableDescriptor = client.describe(getBlurTableName());
            Configuration conf = new Configuration();
            // Blur specific Configuration
            conf.setClass(MAPREDUCE_PARTITIONER_CLASS, BlurPartitioner.class, Partitioner.class);
            conf.set(MAPRED_OUTPUT_COMMITTER_CLASS, BlurOutputCommitter.class.getName());

            // Partition the RDD to match the Blur table shard count. A custom
            // partitioner routes each BlurMutate to the correct shard.
            BlurSparkPartitioner blurSparkPartitioner = new BlurSparkPartitioner(
                    tableDescriptor.getShardCount());
            JavaPairRDD<Text, BlurMutate> flatMapToPair = rdd
                    .flatMapToPair(new PairFlatMapFunction<Tuple2<String, RowMutation>, Text, BlurMutate>() {
                        @Override
                        public Iterable<Tuple2<Text, BlurMutate>> call(Tuple2<String, RowMutation> tuple2)
                                throws Exception {
                            RowMutation rowMutation = tuple2._2;
                            final List<BlurMutate> result = new ArrayList<BlurMutate>();
                            List<RecordMutation> recordMutations = rowMutation.getRecordMutations();
                            String rowId = rowMutation.getRowId();
                            for (RecordMutation recordMutation : recordMutations) {
                                Record record = recordMutation.getRecord();
                                String family = record.getFamily();
                                String recordId = record.getRecordId();
                                List<BlurColumn> columns = toColumns(record.getColumns());

                                BlurRecord blurRecord = new BlurRecord();
                                blurRecord.setRowId(rowId);
                                blurRecord.setFamily(family);
                                blurRecord.setRecordId(recordId);
                                blurRecord.setColumns(columns);
                                result.add(new BlurMutate(MUTATE_TYPE.REPLACE, blurRecord));
                            }
                            return new Iterable<Tuple2<Text, BlurMutate>>() {
                                @Override
                                public Iterator<Tuple2<Text, BlurMutate>> iterator() {
                                    final Iterator<BlurMutate> iterator = result.iterator();
                                    return new Iterator<Tuple2<Text, BlurMutate>>() {

                                        @Override
                                        public boolean hasNext() {
                                            return iterator.hasNext();
                                        }

                                        @Override
                                        public Tuple2<Text, BlurMutate> next() {
                                            BlurMutate blurMutate = iterator.next();
                                            return new Tuple2<Text, BlurMutate>(
                                                    new Text(blurMutate.getRecord().getRowId()), blurMutate);
                                        }

                                        @Override
                                        public void remove() {

                                        }
                                    };
                                }
                            };
                        }

                        private List<BlurColumn> toColumns(List<Column> columns) {
                            List<BlurColumn> cols = new ArrayList<BlurColumn>();
                            for (Column column : columns) {
                                cols.add(new BlurColumn(column.getName(), column.getValue()));
                            }
                            return cols;
                        }
                    });

            final JavaPairRDD<Text, BlurMutate> pRdd = flatMapToPair.partitionBy(blurSparkPartitioner)
                    .persist(getStorageLevel());
            Job job = new Job(conf);
            BlurOutputFormat.setupJob(job, tableDescriptor);
            Path path = new Path(getOutputPath());
            FileSystem fileSystem = path.getFileSystem(conf);
            Path qualified = fileSystem.makeQualified(path);
            BlurOutputFormat.setOutputPath(job, qualified);
            setupBlurHadoopConfig(job.getConfiguration());
            // Write the RDD to Blur Table
            if (pRdd.count() > 0) {
                pRdd.saveAsNewAPIHadoopFile(tableDescriptor.getTableUri(), Text.class, BlurMutate.class,
                        BlurOutputFormat.class, job.getConfiguration());
                client.loadData(getBlurTableName(), qualified.toString());
            }
            return null;
        }
    };
}

From source file:org.apache.kylin.engine.spark.SparkCubing.java

License:Apache License

/** return hfile location */
private String build(JavaRDD<List<String>> javaRDD, final String cubeName, final String segmentId,
        final byte[][] splitKeys) throws Exception {
    CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
    CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
    List<TblColRef> baseCuboidColumn = Cuboid.findById(cubeDesc, Cuboid.getBaseCuboidId(cubeDesc)).getColumns();
    final Map<TblColRef, Integer> columnLengthMap = Maps.newHashMap();
    final CubeDimEncMap dimEncMap = cubeSegment.getDimensionEncodingMap();
    for (TblColRef tblColRef : baseCuboidColumn) {
        columnLengthMap.put(tblColRef, dimEncMap.get(tblColRef).getLengthOfEncoding());
    }
    final Map<TblColRef, Dictionary<String>> dictionaryMap = Maps.newHashMap();
    for (DimensionDesc dim : cubeDesc.getDimensions()) {
        // dictionary
        for (TblColRef col : dim.getColumnRefs()) {
            if (cubeDesc.getRowkey().isUseDictionary(col)) {
                Dictionary<String> dict = cubeSegment.getDictionary(col);
                if (dict == null) {
                    System.err.println("Dictionary for " + col + " was not found.");
                    continue;
                }
                dictionaryMap.put(col, dict);
                System.out.println("col:" + col + " dictionary size:" + dict.getSize());
            }
        }
    }

    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        FunctionDesc func = measureDesc.getFunction();
        List<TblColRef> colRefs = func.getMeasureType().getColumnsNeedDictionary(func);
        for (TblColRef col : colRefs) {
            dictionaryMap.put(col, cubeSegment.getDictionary(col));
        }
    }

    final JavaPairRDD<byte[], byte[]> javaPairRDD = javaRDD.glom()
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<List<List<String>>>, byte[], byte[]>() {

                @Override
                public Iterable<Tuple2<byte[], byte[]>> call(Iterator<List<List<String>>> listIterator)
                        throws Exception {
                    long t = System.currentTimeMillis();
                    prepare();

                    final CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv())
                            .getCube(cubeName);

                    LinkedBlockingQueue<List<String>> blockingQueue = new LinkedBlockingQueue<List<String>>();
                    System.out.println("load properties finished");
                    IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(cubeSegment);
                    AbstractInMemCubeBuilder inMemCubeBuilder = new DoggedCubeBuilder(
                            cubeInstance.getDescriptor(), flatDesc, dictionaryMap);
                    final SparkCuboidWriter sparkCuboidWriter = new BufferedCuboidWriter(
                            new DefaultTupleConverter(cubeInstance.getSegmentById(segmentId), columnLengthMap));
                    Executors.newCachedThreadPool()
                            .submit(inMemCubeBuilder.buildAsRunnable(blockingQueue, sparkCuboidWriter));
                    try {
                        while (listIterator.hasNext()) {
                            for (List<String> row : listIterator.next()) {
                                blockingQueue.put(row);
                            }
                        }
                        blockingQueue.put(Collections.<String>emptyList());
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                    System.out.println("build partition cost: " + (System.currentTimeMillis() - t) + "ms");
                    return sparkCuboidWriter.getResult();
                }
            });

    KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
    Configuration conf = getConfigurationForHFile(cubeSegment.getStorageLocationIdentifier());
    Path path = new Path(kylinConfig.getHdfsWorkingDirectory(), "hfile_" + UUID.randomUUID().toString());
    Preconditions.checkArgument(!FileSystem.get(conf).exists(path));
    String url = conf.get("fs.defaultFS") + path.toString();
    System.out.println("use " + url + " as hfile");
    List<MeasureDesc> measuresDescs = cubeDesc.getMeasures();
    final int measureSize = measuresDescs.size();
    final String[] dataTypes = new String[measureSize];
    for (int i = 0; i < dataTypes.length; i++) {
        dataTypes[i] = measuresDescs.get(i).getFunction().getReturnType();
    }
    final MeasureAggregators aggs = new MeasureAggregators(measuresDescs);
    writeToHFile2(javaPairRDD, dataTypes, measureSize, aggs, splitKeys, conf, url);
    return url;
}

From source file:org.apache.kylin.storage.hbase.steps.SparkCubeHFile.java

License:Apache License

@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
    final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
    final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            KeyValueCreator.class, KeyValue.class, RowKeyWritable.class };

    SparkConf conf = new SparkConf().setAppName("Converting HFile for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
        if (!fs.exists(partitionFilePath)) {
            throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
        }

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());

        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = cubeInstance.getDescriptor();
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

        final MeasureCodec inputCodec = new MeasureCodec(cubeDesc.getMeasures());
        final List<KeyValueCreator> keyValueCreators = Lists.newArrayList();

        for (HBaseColumnFamilyDesc cfDesc : cubeDesc.getHbaseMapping().getColumnFamily()) {
            for (HBaseColumnDesc colDesc : cfDesc.getColumns()) {
                keyValueCreators.add(new KeyValueCreator(cubeDesc, colDesc));
            }
        }

        final int cfNum = keyValueCreators.size();
        final boolean quickPath = (keyValueCreators.size() == 1) && keyValueCreators.get(0).isFullCopy;

        logger.info("Input path: {}", inputPath);
        logger.info("Output path: {}", outputPath);
        // read partition split keys
        List<RowKeyWritable> keys = new ArrayList<>();
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath,
                sc.hadoopConfiguration())) {
            RowKeyWritable key = new RowKeyWritable();
            Writable value = NullWritable.get();
            while (reader.next(key, value)) {
                keys.add(key);
                logger.info(" ------- split key: {}", key);
                key = new RowKeyWritable(); // important: allocate a new object for each key read!
            }
        }

        logger.info("There are {} split keys, totally {} hfiles", keys.size(), (keys.size() + 1));

        //HBase conf
        logger.info("Loading HBase configuration from:{}", hbaseConfFile);
        final Path hbaseConfFilePath = new Path(hbaseConfFile);
        final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());

        try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
            Configuration hbaseJobConf = new Configuration();
            hbaseJobConf.addResource(confInput);
            hbaseJobConf.set("spark.hadoop.dfs.replication", "3"); // HFile, replication=3
            Job job = Job.getInstance(hbaseJobConf, cubeSegment.getStorageLocationIdentifier());

            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            // inputPath has the same FileSystem as hbaseClusterFs when in HBase standalone mode
            JavaPairRDD<Text, Text> inputRDDs = SparkUtil.parseInputPath(inputPath, hbaseClusterFs, sc,
                    Text.class, Text.class);
            final JavaPairRDD<RowKeyWritable, KeyValue> hfilerdd;
            if (quickPath) {
                hfilerdd = inputRDDs
                        .mapToPair(new PairFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Tuple2<RowKeyWritable, KeyValue> call(Tuple2<Text, Text> textTextTuple2)
                                    throws Exception {
                                KeyValue outputValue = keyValueCreators.get(0).create(textTextTuple2._1,
                                        textTextTuple2._2.getBytes(), 0, textTextTuple2._2.getLength());
                                return new Tuple2<>(
                                        new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                        outputValue);
                            }
                        });
            } else {
                hfilerdd = inputRDDs
                        .flatMapToPair(new PairFlatMapFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Iterator<Tuple2<RowKeyWritable, KeyValue>> call(
                                    Tuple2<Text, Text> textTextTuple2) throws Exception {

                                List<Tuple2<RowKeyWritable, KeyValue>> result = Lists
                                        .newArrayListWithExpectedSize(cfNum);
                                Object[] inputMeasures = new Object[cubeDesc.getMeasures().size()];
                                inputCodec.decode(ByteBuffer.wrap(textTextTuple2._2.getBytes(), 0,
                                        textTextTuple2._2.getLength()), inputMeasures);

                                for (int i = 0; i < cfNum; i++) {
                                    KeyValue outputValue = keyValueCreators.get(i).create(textTextTuple2._1,
                                            inputMeasures);
                                    result.add(new Tuple2<>(
                                            new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                            outputValue));
                                }

                                return result.iterator();
                            }
                        });
            }

            hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
                    RowKeyWritable.RowKeyComparator.INSTANCE)
                    .mapToPair(
                            new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
                                @Override
                                public Tuple2<ImmutableBytesWritable, KeyValue> call(
                                        Tuple2<RowKeyWritable, KeyValue> rowKeyWritableKeyValueTuple2)
                                        throws Exception {
                                    return new Tuple2<>(
                                            new ImmutableBytesWritable(
                                                    rowKeyWritableKeyValueTuple2._2.getKey()),
                                            rowKeyWritableKeyValueTuple2._2);
                                }
                            })
                    .saveAsNewAPIHadoopDataset(job.getConfiguration());
        }

        logger.info("HDFS: Number of bytes written={}", jobListener.metrics.getBytesWritten());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.HDFS_BYTES_WRITTEN,
                String.valueOf(jobListener.metrics.getBytesWritten()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
    }
}

From source file:org.biocaddie.citationanalysis.metrics.JavaPageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    JavaSparkContext ctx = SparkUtils.getJavaSparkContext("JavaPageRank");

    // Loads the input file. It should be in the format:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    //    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> lines = ctx.textFile(args[0]);

    // Loads all URLs from the input file and initializes their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs that other URLs link to from the input file and initializes their ranks to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks iteratively using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}

From source file:org.biocaddie.citationanalysis.metrics.JavaPageRankInt.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.println("Usage: JavaPageRankInt <file> <id_file> <number_of_iterations>");
        System.exit(1);
    }

    double alpha = 0.5;
    JavaSparkContext ctx = SparkUtils.getJavaSparkContext("JavaPageRank");

    // Loads the input file. It should be in the format:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    //    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> lines = ctx.textFile(args[0]);

    // Loads all URLs from the input file and initializes their neighbors.
    JavaPairRDD<Integer, Iterable<Integer>> links = lines
            .mapToPair(new PairFunction<String, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(String s) {
                    String[] parts = SPACES.split(s);
                    return new Tuple2<Integer, Integer>(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
                }
            }).distinct().groupByKey().cache();

    // Loads all URLs that other URLs link to from the input file and initializes their ranks to one.

    JavaPairRDD<Integer, Double> ranks = links.mapValues(new Function<Iterable<Integer>, Double>() {
        @Override
        public Double call(Iterable<Integer> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks iteratively using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[2]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<Integer, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<Integer>, Double>, Integer, Double>() {
                    @Override
                    public Iterable<Tuple2<Integer, Double>> call(Tuple2<Iterable<Integer>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<Integer, Double>> results = new ArrayList<Tuple2<Integer, Double>>();
                        for (Integer n : s._1) {
                            results.add(new Tuple2<Integer, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return alpha + sum * (1 - alpha);
                //         return 0.15 + sum * 0.85;
            }
        });
    }

    JavaRDD<String> idLines = ctx.textFile(args[1]);
    JavaPairRDD<Integer, Integer> pmIds = idLines.mapToPair(new PairFunction<String, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(String s) {
            String[] parts = s.split(",");
            return new Tuple2<Integer, Integer>(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
        }
    });

    ranks = ranks.filter(t -> t._2 > 0.8);

    JavaPairRDD<Integer, Tuple2<Double, Integer>> join = ranks.join(pmIds);

    List<Tuple2<Integer, Tuple2<Double, Integer>>> collect = join.collect();
    for (Tuple2<Integer, Tuple2<Double, Integer>> t : collect) {
        System.out.println(t._1 + "," + t._2._2 + "," + t._2._1);
    }

    // Collects all URL ranks and dump them to console.
    //    List<Tuple2<Integer, Double>> output = ranks.collect();
    //    for (Tuple2<?,?> tuple : output) {
    //        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    //    }

    ctx.stop();
}

From source file:org.kaaproject.examples.spark.KaaSparkExample.java

License:Apache License

@SuppressWarnings("serial")
public static void main(String[] args) throws Exception {
    // Initializing Spark streaming context
    JavaStreamingContext ssc = new JavaStreamingContext(new JavaSparkContext(new SparkConf()), BATCH_DURATION);

    // Creating Flume stream to consume the data
    LOG.info("Binding flume stream to {}:{}", args[0], args[1]);
    JavaReceiverInputDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, args[0],
            Integer.parseInt(args[1]));

    // Decode and map incoming events to <ZoneID, ZoneStats> pairs
    JavaPairDStream<Integer, ZoneStats> zoneVoltageDstream = flumeStream
            .flatMapToPair(new PairFlatMapFunction<SparkFlumeEvent, Integer, ZoneStats>() {

                @Override
                public Iterable<Tuple2<Integer, ZoneStats>> call(SparkFlumeEvent sparkFlumeEvent)
                        throws Exception {
                    List<Tuple2<Integer, ZoneStats>> results = new ArrayList<Tuple2<Integer, ZoneStats>>();

                    // Iterating through each record decoded from the event body
                    for (PowerReport record : reader.decodeRecords(sparkFlumeEvent.event().getBody())) {
                        LOG.info("Parsed record: {}", record);
                        // Iterating through per panel samples
                        for (PowerSample sample : record.getSamples()) {
                            results.add(new Tuple2<Integer, ZoneStats>(sample.getZoneId(),
                                    new ZoneStats(1, sample.getPower())));
                        }
                    }

                    LOG.info("Event parsed.");
                    return results;
                }

            });

    // Apply simple reduce function to all <ZoneID, ZoneStats> pairs in
    // order to calculate average and total power produced in each zone.
    zoneVoltageDstream.reduceByKey(
            new Function2<KaaSparkExample.ZoneStats, KaaSparkExample.ZoneStats, KaaSparkExample.ZoneStats>() {

                // Simple reduce function that calculates the total panel count and
                // total power produced within each zone.
                @Override
                public ZoneStats call(ZoneStats v1, ZoneStats v2) throws Exception {
                    return new ZoneStats(v1.panelCount + v2.panelCount, v1.powerSum + v2.powerSum);
                }
                // Map results to string for pretty output
            })
            .transformToPair(new Function<JavaPairRDD<Integer, ZoneStats>, JavaPairRDD<Integer, ZoneStats>>() {

                @Override
                public JavaPairRDD<Integer, ZoneStats> call(JavaPairRDD<Integer, ZoneStats> v1)
                        throws Exception {
                    return v1.sortByKey();
                }
            }).map(new Function<Tuple2<Integer, ZoneStats>, String>() {

                @Override
                public String call(Tuple2<Integer, ZoneStats> tuple) throws Exception {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Zone ").append(tuple._1()).append(": ");
                    sb.append("Total power ").append(tuple._2().getTotalPower()).append(" collected from ")
                            .append(tuple._2().getPanelCount()).append(" panels. ");
                    sb.append("Average power produced by each panel is ").append(tuple._2().getAvgPower());
                    return sb.toString();
                }
            }).print();

    // Start streaming application
    ssc.start();
    // Block until terminated
    ssc.awaitTermination();
}
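
The ZoneStats class is not included in this snippet; a minimal sketch consistent with the constructor and getters used above (a panel count plus a power sum, with a derived total and average) might look like this. The field and method names are taken from the usage in the example; everything else is an assumption.

import java.io.Serializable;

/** Hypothetical reconstruction of ZoneStats, matching how the example constructs and reads it. */
class ZoneStats implements Serializable {
    final int panelCount;
    final double powerSum;

    ZoneStats(int panelCount, double powerSum) {
        this.panelCount = panelCount;
        this.powerSum = powerSum;
    }

    double getTotalPower() {
        return powerSum;
    }

    int getPanelCount() {
        return panelCount;
    }

    double getAvgPower() {
        return panelCount == 0 ? 0.0 : powerSum / panelCount;
    }
}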

From source file:org.kitesdk.examples.spark.CorrelateEventsTask.java

License:Apache License

public void run() throws IOException {
    Configuration conf = new Configuration();
    DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
    DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);

    // Create our Spark configuration and get a Java context
    SparkConf sparkConf = new SparkConf().setAppName("Correlate Events")
            // Configure the use of Kryo serialization including our Avro registrator
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "org.kitesdk.examples.spark.AvroKyroRegistrator");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    JavaPairRDD<StandardEvent, Void> events = sparkContext.newAPIHadoopRDD(conf, DatasetKeyInputFormat.class,
            StandardEvent.class, Void.class);

    // Map each event to two correlation keys. One with the IP address and the
    // nearest 5 minute interval that happened before the event and one with the
    // IP address and the nearest 5 minute interval that happened after the event
    JavaPairRDD<CorrelationKey, StandardEvent> mappedEvents = events.flatMapToPair(
            new PairFlatMapFunction<Tuple2<StandardEvent, Void>, CorrelationKey, StandardEvent>() {
                @Override
                public Iterable<Tuple2<CorrelationKey, StandardEvent>> call(Tuple2<StandardEvent, Void> t)
                        throws Exception {
                    List<Tuple2<CorrelationKey, StandardEvent>> result = new ArrayList<Tuple2<CorrelationKey, StandardEvent>>(
                            2);

                    StandardEvent event = t._1();
                    long loTimestamp = createLoTimestamp(event.getTimestamp());
                    long hiTimestamp = createHiTimestamp(event.getTimestamp());
                    String ip = event.getIp().toString();

                    result.add(new Tuple2<CorrelationKey, StandardEvent>(new CorrelationKey(loTimestamp, ip),
                            event));
                    result.add(new Tuple2<CorrelationKey, StandardEvent>(new CorrelationKey(hiTimestamp, ip),
                            event));

                    return result;
                }
            });

    // Group the events by their correlation key
    JavaPairRDD<CorrelationKey, Iterable<StandardEvent>> groupedEvents = mappedEvents.groupByKey();

    // Generate potential matches by creating a list of alerts along with the
    // matched list of clicks. If no alerts were found with this correlation
    // key, then output an empty pair
    JavaPairRDD<List<StandardEvent>, List<StandardEvent>> potentialMatches = groupedEvents.mapToPair(
            new PairFunction<Tuple2<CorrelationKey, Iterable<StandardEvent>>, List<StandardEvent>, List<StandardEvent>>() {

                @Override
                public Tuple2<List<StandardEvent>, List<StandardEvent>> call(
                        Tuple2<CorrelationKey, Iterable<StandardEvent>> t) throws Exception {
                    Iterable<StandardEvent> allEvents = t._2();
                    List<StandardEvent> alerts = new ArrayList<StandardEvent>();
                    List<StandardEvent> clicks = new ArrayList<StandardEvent>();

                    for (StandardEvent event : allEvents) {
                        if (event.getEventDetails() != null
                                && event.getEventDetails().containsKey(new Utf8("type"))
                                && "alert".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                            alerts.add(event);
                        } else if (event.getEventDetails() != null
                                && event.getEventDetails().containsKey(new Utf8("type"))
                                && "click".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                            clicks.add(event);
                        }
                    }

                    if (alerts.isEmpty()) {
                        return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, alerts);
                    } else {
                        return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, clicks);
                    }
                }
            });

    // Verify that the matched events are true matches (i.e., the timestamps
    // are really less than or equal to 5 minutes apart).
    JavaPairRDD<CorrelatedEvents, Void> matches = potentialMatches.flatMapToPair(
            new PairFlatMapFunction<Tuple2<List<StandardEvent>, List<StandardEvent>>, CorrelatedEvents, Void>() {

                @Override
                public Iterable<Tuple2<CorrelatedEvents, Void>> call(
                        Tuple2<List<StandardEvent>, List<StandardEvent>> t) throws Exception {
                    List<Tuple2<CorrelatedEvents, Void>> results = new ArrayList<Tuple2<CorrelatedEvents, Void>>();
                    List<StandardEvent> alerts = t._1();
                    List<StandardEvent> clicks = t._2();

                    for (StandardEvent alert : alerts) {
                        List<StandardEvent> correlated = new ArrayList<StandardEvent>();
                        for (StandardEvent click : clicks) {
                            if (Math.abs(alert.getTimestamp() - click.getTimestamp()) <= FIVE_MIN_MILLIS) {
                                correlated.add(click);
                            }
                        }
                        if (!correlated.isEmpty()) {
                            results.add(new Tuple2<CorrelatedEvents, Void>(CorrelatedEvents.newBuilder()
                                    .setEvent(alert).setCorrelated(correlated).build(), null));
                        }
                    }

                    return results;
                }
            });

    // Write the data to a Kite dataset
    matches.saveAsNewAPIHadoopFile("dummy", CorrelatedEvents.class, Void.class, DatasetKeyOutputFormat.class,
            conf);
}