List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction
PairFlatMapFunction
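PairFlatMapFunction is the flatMap-to-pairs counterpart of PairFunction: it is passed to JavaRDD.flatMapToPair (or mapPartitionsToPair, or a DStream's flatMapToPair, as in the examples below) and emits zero or more Tuple2 key-value pairs per input element. Note that the signature changed between releases: in the Spark 1.x Java API, call() returns an Iterable<Tuple2<K, V>>, while from Spark 2.0 onward it returns an Iterator<Tuple2<K, V>>; both forms appear in the examples on this page. Before the full examples, here is a minimal self-contained sketch written against the Spark 2.x signature; the input data and app name are illustrative only and not taken from the examples below.

// Minimal sketch of PairFlatMapFunction with flatMapToPair (Spark 2.x Java API).
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapFunctionSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("PairFlatMapFunctionSketch").setMaster("local[2]"));
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "b c"));

        // Emit one (word, 1) pair per token of each input line.
        JavaPairRDD<String, Integer> pairs = lines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    @Override
                    public Iterator<Tuple2<String, Integer>> call(String line) {
                        List<Tuple2<String, Integer>> out = new ArrayList<>();
                        for (String word : line.split(" ")) {
                            out.add(new Tuple2<>(word, 1));
                        }
                        return out.iterator(); // Spark 1.x expects an Iterable here instead
                    }
                });

        // Sum the counts per word and print the result.
        System.out.println(pairs.reduceByKey((a, b) -> a + b).collectAsMap());
        sc.stop();
    }
}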
From source file: info.debatty.spark.knngraphs.builder.DoubleApproximate.java
License: Open Source License

@Override
protected JavaPairRDD<Integer, Node<SparseDoubleVector>> _binNodes(JavaRDD<Node<SparseDoubleVector>> nodes)
        throws Exception {
    final long count = nodes.count();
    lsh = new info.debatty.java.lsh.LSHSuperBit(stages, buckets, this.dim);
    return nodes.flatMapToPair(
            new PairFlatMapFunction<Node<SparseDoubleVector>, Integer, Node<SparseDoubleVector>>() {
                public Iterable<Tuple2<Integer, Node<SparseDoubleVector>>> call(Node<SparseDoubleVector> n)
                        throws Exception {
                    ArrayList<Tuple2<Integer, Node<SparseDoubleVector>>> r =
                            new ArrayList<Tuple2<Integer, Node<SparseDoubleVector>>>();
                    int[] hash = lsh.hash(n.value);

                    // Downsample vectors using DIMSUM
                    n.value.sampleDIMSUM(0.5, (int) count, dim);

                    for (int v : hash) {
                        r.add(new Tuple2<Integer, Node<SparseDoubleVector>>(v, n));
                    }
                    return r;
                }
            });
}
From source file: info.debatty.spark.knngraphs.builder.NNCTPH.java
License: Open Source License

@Override
protected JavaPairRDD<Integer, Node<String>> _binNodes(JavaRDD<Node<String>> nodes) {
    return nodes.flatMapToPair(new PairFlatMapFunction<Node<String>, Integer, Node<String>>() {
        public Iterable<Tuple2<Integer, Node<String>>> call(Node<String> n) throws Exception {
            ESSum ess = new ESSum(stages, buckets, 1);
            ArrayList<Tuple2<Integer, Node<String>>> r = new ArrayList<Tuple2<Integer, Node<String>>>();
            int[] hash = ess.HashString(n.value);
            for (int v : hash) {
                r.add(new Tuple2<Integer, Node<String>>(v, n));
            }
            return r;
        }
    });
}
From source file: it.unitn.spark.examples.bigdata2015.JavaPageRank.java
License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL neighbor URL
    //     URL neighbor URL
    //     URL neighbor URL
    //     ...
    JavaRDD<String> lines = sc.textFile(args[0], 1);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and
    // initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    public Iterator<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results.iterator();
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dump them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    sc.stop();
    sc.close();
}
From source file: org.apache.blur.spark.BlurMRBulkLoadSparkProcessor.java
License: Apache License

@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
    return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {
            // Blur Table Details
            Iface client = getBlurClient();
            TableDescriptor tableDescriptor = client.describe(getBlurTableName());
            Configuration conf = new Configuration();

            // Blur specific Configuration
            conf.setClass(MAPREDUCE_PARTITIONER_CLASS, BlurPartitioner.class, Partitioner.class);
            conf.set(MAPRED_OUTPUT_COMMITTER_CLASS, BlurOutputCommitter.class.getName());

            // Partition RDD to match Blur Table Shard Count. Used Custom
            // Partitioner to channel correct BlurMutate to correct Shard.
            BlurSparkPartitioner blurSparkPartitioner = new BlurSparkPartitioner(
                    tableDescriptor.getShardCount());

            JavaPairRDD<Text, BlurMutate> flatMapToPair = rdd
                    .flatMapToPair(new PairFlatMapFunction<Tuple2<String, RowMutation>, Text, BlurMutate>() {
                        @Override
                        public Iterable<Tuple2<Text, BlurMutate>> call(Tuple2<String, RowMutation> tuple2)
                                throws Exception {
                            RowMutation rowMutation = tuple2._2;
                            final List<BlurMutate> result = new ArrayList<BlurMutate>();
                            List<RecordMutation> recordMutations = rowMutation.getRecordMutations();
                            String rowId = rowMutation.getRowId();
                            for (RecordMutation recordMutation : recordMutations) {
                                Record record = recordMutation.getRecord();
                                String family = record.getFamily();
                                String recordId = record.getRecordId();
                                List<BlurColumn> columns = toColumns(record.getColumns());

                                BlurRecord blurRecord = new BlurRecord();
                                blurRecord.setRowId(rowId);
                                blurRecord.setFamily(family);
                                blurRecord.setRecordId(recordId);
                                blurRecord.setColumns(columns);
                                result.add(new BlurMutate(MUTATE_TYPE.REPLACE, blurRecord));
                            }
                            return new Iterable<Tuple2<Text, BlurMutate>>() {
                                @Override
                                public Iterator<Tuple2<Text, BlurMutate>> iterator() {
                                    final Iterator<BlurMutate> iterator = result.iterator();
                                    return new Iterator<Tuple2<Text, BlurMutate>>() {
                                        @Override
                                        public boolean hasNext() {
                                            return iterator.hasNext();
                                        }

                                        @Override
                                        public Tuple2<Text, BlurMutate> next() {
                                            BlurMutate blurMutate = iterator.next();
                                            return new Tuple2<Text, BlurMutate>(
                                                    new Text(blurMutate.getRecord().getRowId()), blurMutate);
                                        }

                                        @Override
                                        public void remove() {
                                        }
                                    };
                                }
                            };
                        }

                        private List<BlurColumn> toColumns(List<Column> columns) {
                            List<BlurColumn> cols = new ArrayList<BlurColumn>();
                            for (Column column : columns) {
                                cols.add(new BlurColumn(column.getName(), column.getValue()));
                            }
                            return cols;
                        }
                    });

            final JavaPairRDD<Text, BlurMutate> pRdd = flatMapToPair.partitionBy(blurSparkPartitioner)
                    .persist(getStorageLevel());

            Job job = new Job(conf);
            BlurOutputFormat.setupJob(job, tableDescriptor);
            Path path = new Path(getOutputPath());
            FileSystem fileSystem = path.getFileSystem(conf);
            Path qualified = fileSystem.makeQualified(path);
            BlurOutputFormat.setOutputPath(job, qualified);
            setupBlurHadoopConfig(job.getConfiguration());

            // Write the RDD to Blur Table
            if (pRdd.count() > 0) {
                pRdd.saveAsNewAPIHadoopFile(tableDescriptor.getTableUri(), Text.class, BlurMutate.class,
                        BlurOutputFormat.class, job.getConfiguration());
                client.loadData(getBlurTableName(), qualified.toString());
            }
            return null;
        }
    };
}
From source file: org.apache.kylin.engine.spark.SparkCubing.java
License: Apache License

/** return hfile location */
private String build(JavaRDD<List<String>> javaRDD, final String cubeName, final String segmentId,
        final byte[][] splitKeys) throws Exception {
    CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
    CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
    List<TblColRef> baseCuboidColumn = Cuboid.findById(cubeDesc, Cuboid.getBaseCuboidId(cubeDesc)).getColumns();
    final Map<TblColRef, Integer> columnLengthMap = Maps.newHashMap();
    final CubeDimEncMap dimEncMap = cubeSegment.getDimensionEncodingMap();
    for (TblColRef tblColRef : baseCuboidColumn) {
        columnLengthMap.put(tblColRef, dimEncMap.get(tblColRef).getLengthOfEncoding());
    }

    final Map<TblColRef, Dictionary<String>> dictionaryMap = Maps.newHashMap();
    for (DimensionDesc dim : cubeDesc.getDimensions()) {
        // dictionary
        for (TblColRef col : dim.getColumnRefs()) {
            if (cubeDesc.getRowkey().isUseDictionary(col)) {
                Dictionary<String> dict = cubeSegment.getDictionary(col);
                if (dict == null) {
                    System.err.println("Dictionary for " + col + " was not found.");
                    continue;
                }
                dictionaryMap.put(col, dict);
                System.out.println("col:" + col + " dictionary size:" + dict.getSize());
            }
        }
    }

    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        FunctionDesc func = measureDesc.getFunction();
        List<TblColRef> colRefs = func.getMeasureType().getColumnsNeedDictionary(func);
        for (TblColRef col : colRefs) {
            dictionaryMap.put(col, cubeSegment.getDictionary(col));
        }
    }

    final JavaPairRDD<byte[], byte[]> javaPairRDD = javaRDD.glom()
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<List<List<String>>>, byte[], byte[]>() {

                @Override
                public Iterable<Tuple2<byte[], byte[]>> call(Iterator<List<List<String>>> listIterator)
                        throws Exception {
                    long t = System.currentTimeMillis();
                    prepare();
                    final CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv())
                            .getCube(cubeName);
                    LinkedBlockingQueue<List<String>> blockingQueue = new LinkedBlockingQueue();
                    System.out.println("load properties finished");
                    IJoinedFlatTableDesc flatDesc = EngineFactory.getJoinedFlatTableDesc(cubeSegment);
                    AbstractInMemCubeBuilder inMemCubeBuilder = new DoggedCubeBuilder(
                            cubeInstance.getDescriptor(), flatDesc, dictionaryMap);
                    final SparkCuboidWriter sparkCuboidWriter = new BufferedCuboidWriter(
                            new DefaultTupleConverter(cubeInstance.getSegmentById(segmentId), columnLengthMap));
                    Executors.newCachedThreadPool()
                            .submit(inMemCubeBuilder.buildAsRunnable(blockingQueue, sparkCuboidWriter));
                    try {
                        while (listIterator.hasNext()) {
                            for (List<String> row : listIterator.next()) {
                                blockingQueue.put(row);
                            }
                        }
                        blockingQueue.put(Collections.<String>emptyList());
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                    System.out.println("build partition cost: " + (System.currentTimeMillis() - t) + "ms");
                    return sparkCuboidWriter.getResult();
                }
            });

    KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
    Configuration conf = getConfigurationForHFile(cubeSegment.getStorageLocationIdentifier());
    Path path = new Path(kylinConfig.getHdfsWorkingDirectory(), "hfile_" + UUID.randomUUID().toString());
    Preconditions.checkArgument(!FileSystem.get(conf).exists(path));
    String url = conf.get("fs.defaultFS") + path.toString();
    System.out.println("use " + url + " as hfile");

    List<MeasureDesc> measuresDescs = cubeDesc.getMeasures();
    final int measureSize = measuresDescs.size();
    final String[] dataTypes = new String[measureSize];
    for (int i = 0; i < dataTypes.length; i++) {
        dataTypes[i] = measuresDescs.get(i).getFunction().getReturnType();
    }
    final MeasureAggregators aggs = new MeasureAggregators(measuresDescs);
    writeToHFile2(javaPairRDD, dataTypes, measureSize, aggs, splitKeys, conf, url);
    return url;
}
From source file: org.apache.kylin.storage.hbase.steps.SparkCubeHFile.java
License: Apache License

@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
    final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
    final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            KeyValueCreator.class, KeyValue.class, RowKeyWritable.class };

    SparkConf conf = new SparkConf().setAppName("Converting HFile for:" + cubeName + " segment " + segmentId);
    // serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);

        final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
        if (!fs.exists(partitionFilePath)) {
            throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
        }

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = cubeInstance.getDescriptor();
        final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);
        final MeasureCodec inputCodec = new MeasureCodec(cubeDesc.getMeasures());

        final List<KeyValueCreator> keyValueCreators = Lists.newArrayList();
        for (HBaseColumnFamilyDesc cfDesc : cubeDesc.getHbaseMapping().getColumnFamily()) {
            for (HBaseColumnDesc colDesc : cfDesc.getColumns()) {
                keyValueCreators.add(new KeyValueCreator(cubeDesc, colDesc));
            }
        }

        final int cfNum = keyValueCreators.size();
        final boolean quickPath = (keyValueCreators.size() == 1) && keyValueCreators.get(0).isFullCopy;

        logger.info("Input path: {}", inputPath);
        logger.info("Output path: {}", outputPath);

        // read partition split keys
        List<RowKeyWritable> keys = new ArrayList<>();
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath,
                sc.hadoopConfiguration())) {
            RowKeyWritable key = new RowKeyWritable();
            Writable value = NullWritable.get();
            while (reader.next(key, value)) {
                keys.add(key);
                logger.info(" ------- split key: {}", key);
                key = new RowKeyWritable(); // important, new an object!
            }
        }

        logger.info("There are {} split keys, totally {} hfiles", keys.size(), (keys.size() + 1));

        // HBase conf
        logger.info("Loading HBase configuration from:{}", hbaseConfFile);
        final Path hbaseConfFilePath = new Path(hbaseConfFile);
        final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());

        try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
            Configuration hbaseJobConf = new Configuration();
            hbaseJobConf.addResource(confInput);
            hbaseJobConf.set("spark.hadoop.dfs.replication", "3"); // HFile, replication=3
            Job job = Job.getInstance(hbaseJobConf, cubeSegment.getStorageLocationIdentifier());

            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            // inputPath has the same FileSystem as hbaseClusterFs when in HBase standalone mode
            JavaPairRDD<Text, Text> inputRDDs = SparkUtil.parseInputPath(inputPath, hbaseClusterFs, sc,
                    Text.class, Text.class);
            final JavaPairRDD<RowKeyWritable, KeyValue> hfilerdd;
            if (quickPath) {
                hfilerdd = inputRDDs
                        .mapToPair(new PairFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Tuple2<RowKeyWritable, KeyValue> call(Tuple2<Text, Text> textTextTuple2)
                                    throws Exception {
                                KeyValue outputValue = keyValueCreators.get(0).create(textTextTuple2._1,
                                        textTextTuple2._2.getBytes(), 0, textTextTuple2._2.getLength());
                                return new Tuple2<>(
                                        new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                        outputValue);
                            }
                        });
            } else {
                hfilerdd = inputRDDs
                        .flatMapToPair(new PairFlatMapFunction<Tuple2<Text, Text>, RowKeyWritable, KeyValue>() {
                            @Override
                            public Iterator<Tuple2<RowKeyWritable, KeyValue>> call(
                                    Tuple2<Text, Text> textTextTuple2) throws Exception {
                                List<Tuple2<RowKeyWritable, KeyValue>> result = Lists
                                        .newArrayListWithExpectedSize(cfNum);
                                Object[] inputMeasures = new Object[cubeDesc.getMeasures().size()];
                                inputCodec.decode(ByteBuffer.wrap(textTextTuple2._2.getBytes(), 0,
                                        textTextTuple2._2.getLength()), inputMeasures);

                                for (int i = 0; i < cfNum; i++) {
                                    KeyValue outputValue = keyValueCreators.get(i).create(textTextTuple2._1,
                                            inputMeasures);
                                    result.add(new Tuple2<>(
                                            new RowKeyWritable(outputValue.createKeyOnly(false).getKey()),
                                            outputValue));
                                }
                                return result.iterator();
                            }
                        });
            }

            hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
                    RowKeyWritable.RowKeyComparator.INSTANCE)
                    .mapToPair(
                            new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
                                @Override
                                public Tuple2<ImmutableBytesWritable, KeyValue> call(
                                        Tuple2<RowKeyWritable, KeyValue> rowKeyWritableKeyValueTuple2)
                                        throws Exception {
                                    return new Tuple2<>(
                                            new ImmutableBytesWritable(rowKeyWritableKeyValueTuple2._2.getKey()),
                                            rowKeyWritableKeyValueTuple2._2);
                                }
                            })
                    .saveAsNewAPIHadoopDataset(job.getConfiguration());
        }

        logger.info("HDFS: Number of bytes written={}", jobListener.metrics.getBytesWritten());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.HDFS_BYTES_WRITTEN,
                String.valueOf(jobListener.metrics.getBytesWritten()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
    }
}
From source file: org.biocaddie.citationanalysis.metrics.JavaPageRank.java
License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    showWarning();

    JavaSparkContext ctx = SparkUtils.getJavaSparkContext("JavaPageRank");

    // Loads in input file. It should be in format of:
    //     URL neighbor URL
    //     URL neighbor URL
    //     URL neighbor URL
    //     ...
    // JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> lines = ctx.textFile(args[0]);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    // Collects all URL ranks and dump them to console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}
From source file: org.biocaddie.citationanalysis.metrics.JavaPageRankInt.java
License: Apache License

public static void main(String[] args) throws Exception {
    // The code below reads three arguments: the link file, the id mapping file, and the iteration count.
    if (args.length < 3) {
        System.err.println("Usage: JavaPageRank <file> <id_file> <number_of_iterations>");
        System.exit(1);
    }

    double alpha = 0.5;

    JavaSparkContext ctx = SparkUtils.getJavaSparkContext("JavaPageRank");

    // Loads in input file. It should be in format of:
    //     URL neighbor URL
    //     URL neighbor URL
    //     URL neighbor URL
    //     ...
    // JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> lines = ctx.textFile(args[0]);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<Integer, Iterable<Integer>> links = lines
            .mapToPair(new PairFunction<String, Integer, Integer>() {
                @Override
                public Tuple2<Integer, Integer> call(String s) {
                    String[] parts = SPACES.split(s);
                    return new Tuple2<Integer, Integer>(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
                }
            }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<Integer, Double> ranks = links.mapValues(new Function<Iterable<Integer>, Double>() {
        @Override
        public Double call(Iterable<Integer> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[2]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<Integer, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<Integer>, Double>, Integer, Double>() {
                    @Override
                    public Iterable<Tuple2<Integer, Double>> call(Tuple2<Iterable<Integer>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<Integer, Double>> results = new ArrayList<Tuple2<Integer, Double>>();
                        for (Integer n : s._1) {
                            results.add(new Tuple2<Integer, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return alpha + sum * (1 - alpha);
                // return 0.15 + sum * 0.85;
            }
        });
    }

    JavaRDD<String> idLines = ctx.textFile(args[1]);
    JavaPairRDD<Integer, Integer> pmIds = idLines.mapToPair(new PairFunction<String, Integer, Integer>() {
        @Override
        public Tuple2<Integer, Integer> call(String s) {
            String[] parts = s.split(",");
            return new Tuple2<Integer, Integer>(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
        }
    });

    ranks = ranks.filter(t -> t._2 > 0.8);

    JavaPairRDD<Integer, Tuple2<Double, Integer>> join = ranks.join(pmIds);
    List<Tuple2<Integer, Tuple2<Double, Integer>>> collect = join.collect();
    for (Tuple2<Integer, Tuple2<Double, Integer>> t : collect) {
        System.out.println(t._1 + "," + t._2._2 + "," + t._2._1);
    }

    // Collects all URL ranks and dump them to console.
    // List<Tuple2<Integer, Double>> output = ranks.collect();
    // for (Tuple2<?, ?> tuple : output) {
    //     System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    // }

    ctx.stop();
}
From source file: org.kaaproject.examples.spark.KaaSparkExample.java
License: Apache License

@SuppressWarnings("serial")
public static void main(String[] args) throws Exception {
    // Initializing Spark streaming context
    JavaStreamingContext ssc = new JavaStreamingContext(new JavaSparkContext(new SparkConf()), BATCH_DURATION);

    // Creating Flume stream to consume the data
    LOG.info("Binding flume stream to {}:{}", args[0], args[1]);
    JavaReceiverInputDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, args[0],
            Integer.parseInt(args[1]));

    // Decode and map incoming events to <ZoneID, ZoneStats> pairs
    JavaPairDStream<Integer, ZoneStats> zoneVoltageDstream = flumeStream
            .flatMapToPair(new PairFlatMapFunction<SparkFlumeEvent, Integer, ZoneStats>() {
                @Override
                public Iterable<Tuple2<Integer, ZoneStats>> call(SparkFlumeEvent sparkFlumeEvent)
                        throws Exception {
                    List<Tuple2<Integer, ZoneStats>> results = new ArrayList<Tuple2<Integer, ZoneStats>>();
                    // Iterating through each event
                    for (PowerReport record : reader.decodeRecords(sparkFlumeEvent.event().getBody())) {
                        LOG.info("Parsed record: {}", record);
                        // Iterating through per panel samples
                        for (PowerSample sample : record.getSamples()) {
                            results.add(new Tuple2<Integer, ZoneStats>(sample.getZoneId(),
                                    new ZoneStats(1, sample.getPower())));
                        }
                    }
                    LOG.info("Event parsed.");
                    return results;
                }
            });

    // Apply simple reduce function to all <ZoneID, ZoneStats> pairs in
    // order to calculate average and total power produced in each zone.
    zoneVoltageDstream
            .reduceByKey(new Function2<KaaSparkExample.ZoneStats, KaaSparkExample.ZoneStats, KaaSparkExample.ZoneStats>() {
                // Simple reduce function that calculates total panel count and
                // total power produced in scope of each zone.
                @Override
                public ZoneStats call(ZoneStats v1, ZoneStats v2) throws Exception {
                    return new ZoneStats(v1.panelCount + v2.panelCount, v1.powerSum + v2.powerSum);
                }
            })
            .transformToPair(new Function<JavaPairRDD<Integer, ZoneStats>, JavaPairRDD<Integer, ZoneStats>>() {
                @Override
                public JavaPairRDD<Integer, ZoneStats> call(JavaPairRDD<Integer, ZoneStats> v1)
                        throws Exception {
                    return v1.sortByKey();
                }
            })
            // Map results to string for pretty output
            .map(new Function<Tuple2<Integer, ZoneStats>, String>() {
                @Override
                public String call(Tuple2<Integer, ZoneStats> tuple) throws Exception {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Zone ").append(tuple._1()).append(": ");
                    sb.append("Total power ").append(tuple._2().getTotalPower()).append(" collected from ")
                            .append(tuple._2().getPanelCount()).append(" panels. ");
                    sb.append("Average power produced by each panel is ").append(tuple._2().getAvgPower());
                    return sb.toString();
                }
            }).print();

    // Start streaming application
    ssc.start();
    // Block until terminated
    ssc.awaitTermination();
}
From source file: org.kitesdk.examples.spark.CorrelateEventsTask.java
License: Apache License

public void run() throws IOException {
    Configuration conf = new Configuration();
    DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
    DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);

    // Create our Spark configuration and get a Java context
    SparkConf sparkConf = new SparkConf().setAppName("Correlate Events")
            // Configure the use of Kryo serialization including our Avro registrator
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .set("spark.kryo.registrator", "org.kitesdk.examples.spark.AvroKyroRegistrator");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    JavaPairRDD<StandardEvent, Void> events = sparkContext.newAPIHadoopRDD(conf, DatasetKeyInputFormat.class,
            StandardEvent.class, Void.class);

    // Map each event to two correlation keys. One with the IP address and the
    // nearest 5 minute interval that happened before the event and one with the
    // IP address and the nearest 5 minute interval that happened after the event
    JavaPairRDD<CorrelationKey, StandardEvent> mappedEvents = events.flatMapToPair(
            new PairFlatMapFunction<Tuple2<StandardEvent, Void>, CorrelationKey, StandardEvent>() {
                @Override
                public Iterable<Tuple2<CorrelationKey, StandardEvent>> call(Tuple2<StandardEvent, Void> t)
                        throws Exception {
                    List<Tuple2<CorrelationKey, StandardEvent>> result =
                            new ArrayList<Tuple2<CorrelationKey, StandardEvent>>(2);
                    StandardEvent event = t._1();
                    long loTimestamp = createLoTimestamp(event.getTimestamp());
                    long hiTimestamp = createHiTimestamp(event.getTimestamp());
                    String ip = event.getIp().toString();

                    result.add(new Tuple2<CorrelationKey, StandardEvent>(new CorrelationKey(loTimestamp, ip),
                            event));
                    result.add(new Tuple2<CorrelationKey, StandardEvent>(new CorrelationKey(hiTimestamp, ip),
                            event));

                    return result;
                }
            });

    // Group the events by their correlation key
    JavaPairRDD<CorrelationKey, Iterable<StandardEvent>> groupedEvents = mappedEvents.groupByKey();

    // Generate potential matches by creating a list of alerts along with the
    // matched list of clicks. If no alerts were found with this correlation
    // key, then output an empty pair
    JavaPairRDD<List<StandardEvent>, List<StandardEvent>> potentialMatches = groupedEvents.mapToPair(
            new PairFunction<Tuple2<CorrelationKey, Iterable<StandardEvent>>, List<StandardEvent>, List<StandardEvent>>() {
                @Override
                public Tuple2<List<StandardEvent>, List<StandardEvent>> call(
                        Tuple2<CorrelationKey, Iterable<StandardEvent>> t) throws Exception {
                    Iterable<StandardEvent> allEvents = t._2();
                    List<StandardEvent> alerts = new ArrayList<StandardEvent>();
                    List<StandardEvent> clicks = new ArrayList<StandardEvent>();

                    for (StandardEvent event : allEvents) {
                        if (event.getEventDetails() != null
                                && event.getEventDetails().containsKey(new Utf8("type"))
                                && "alert".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                            alerts.add(event);
                        } else if (event.getEventDetails() != null
                                && event.getEventDetails().containsKey(new Utf8("type"))
                                && "click".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                            clicks.add(event);
                        }
                    }

                    if (alerts.isEmpty()) {
                        return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, alerts);
                    } else {
                        return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, clicks);
                    }
                }
            });

    // Verify that the matched events are true matches (i.e. the timestamps
    // are really less than or equal to 5 minutes apart)
    JavaPairRDD<CorrelatedEvents, Void> matches = potentialMatches.flatMapToPair(
            new PairFlatMapFunction<Tuple2<List<StandardEvent>, List<StandardEvent>>, CorrelatedEvents, Void>() {
                @Override
                public Iterable<Tuple2<CorrelatedEvents, Void>> call(
                        Tuple2<List<StandardEvent>, List<StandardEvent>> t) throws Exception {
                    List<Tuple2<CorrelatedEvents, Void>> results = new ArrayList<Tuple2<CorrelatedEvents, Void>>();
                    List<StandardEvent> alerts = t._1();
                    List<StandardEvent> clicks = t._2();

                    for (StandardEvent alert : alerts) {
                        List<StandardEvent> correlated = new ArrayList<StandardEvent>();
                        for (StandardEvent click : clicks) {
                            if (Math.abs(alert.getTimestamp() - click.getTimestamp()) <= FIVE_MIN_MILLIS) {
                                correlated.add(click);
                            }
                        }
                        if (!correlated.isEmpty()) {
                            results.add(new Tuple2(CorrelatedEvents.newBuilder().setEvent(alert)
                                    .setCorrelated(correlated).build(), null));
                        }
                    }

                    return results;
                }
            });

    // Write the data to a Kite dataset
    matches.saveAsNewAPIHadoopFile("dummy", CorrelatedEvents.class, Void.class, DatasetKeyOutputFormat.class,
            conf);
}