com.audaque.instancematch.match.GenerateSignature.java Source code

Java tutorial

Introduction

Here is the source code for com.audaque.instancematch.match.GenerateSignature.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.audaque.instancematch.match;

import com.audaque.instancematch.hash.Hash;
import com.audaque.instancematch.hash.QGramHash;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;

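/**
 * Spark helper that computes, for each seed listed in a seed file, the minimum
 * hash value obtained over all lines of a source text file.
 *
 * @author Administrator
 */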
public class GenerateSignature {

    /** Minimum number of partitions requested when reading the source file. */
    private static final int SOURCE_MIN_PARTITIONS = 40;

    /**
     * Computes, for every seed in {@code seedFile}, the minimum hash value over
     * all lines of {@code srcFile}.
     *
     * <p>Each source line is hashed once per seed, either with
     * {@code Hash.RSHash(line, seed)} (when {@code q < 0}) or with
     * {@code QGramHash.RSHash(line, seed, q)} (otherwise). The per-seed minimum
     * is then taken across all lines.
     *
     * <p>The seed file is collected to the driver, so it is expected to be
     * small. Seeds are parsed and validated on the driver before the job
     * starts: blank lines are skipped, surrounding whitespace is ignored, and a
     * malformed seed is reported immediately instead of failing inside a task.
     *
     * @param srcFile  path of the text file whose lines are hashed
     * @param seedFile path of a text file containing one integer seed per line
     * @param q        passed to {@code QGramHash.RSHash} when non-negative; a
     *                 negative value selects {@code Hash.RSHash} instead
     * @param hashNum  not read by this method; the number of result entries is
     *                 determined by the seed file. Kept so existing callers
     *                 continue to compile.
     * @param sc       Spark context used to read both files
     * @return one {@code (seed, minimumHash)} pair per distinct seed, both
     *         rendered as decimal strings, ordered by the seed's <em>string</em>
     *         value (so {@code "10"} sorts before {@code "2"}); empty when the
     *         source file has no lines
     * @throws IOException never thrown by this method directly; declared for
     *                     compatibility with existing callers
     * @throws IllegalArgumentException if a non-blank seed line is not a valid
     *                                  integer
     */
    public static List<Tuple2<String, String>> generateSignature(String srcFile, String seedFile, final int q,
            int hashNum, JavaSparkContext sc) throws IOException {
        JavaRDD<String> srcRDD = sc.textFile(srcFile, SOURCE_MIN_PARTITIONS);

        // Resolve the seeds once on the driver; the arrays below are captured by
        // the task closure, so no per-line parsing is needed on the executors.
        final String[] seedKeys = nonBlankSeeds(sc.textFile(seedFile).collect());
        final int[] seedValues = parseSeeds(seedKeys, seedFile);

        JavaPairRDD<String, Integer> seedHashRDD = srcRDD
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    @Override
                    public Iterable<Tuple2<String, Integer>> call(String line) throws Exception {
                        List<Tuple2<String, Integer>> hashes =
                                new ArrayList<Tuple2<String, Integer>>(seedKeys.length);
                        for (int i = 0; i < seedKeys.length; i++) {
                            int hash;
                            if (q < 0) {
                                hash = Hash.RSHash(line, seedValues[i]);
                            } else {
                                hash = QGramHash.RSHash(line, seedValues[i], q);
                            }
                            hashes.add(new Tuple2<String, Integer>(seedKeys[i], hash));
                        }
                        return hashes;
                    }
                });

        // Keep hashes numeric through the shuffle so the reduction is a plain
        // integer minimum rather than a parse-compare on strings.
        JavaPairRDD<String, Integer> minHashRDD = seedHashRDD
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer left, Integer right) throws Exception {
                        return left < right ? left : right;
                    }
                });

        // The result has one entry per seed, so converting on the driver is cheap.
        List<Tuple2<String, Integer>> sorted = minHashRDD.sortByKey().collect();
        List<Tuple2<String, String>> signature = new ArrayList<Tuple2<String, String>>(sorted.size());
        for (Tuple2<String, Integer> entry : sorted) {
            signature.add(new Tuple2<String, String>(entry._1(), String.valueOf(entry._2())));
        }
        return signature;
    }

    /** Returns the trimmed, non-blank entries of {@code seedLines}, in order. */
    private static String[] nonBlankSeeds(List<String> seedLines) {
        List<String> kept = new ArrayList<String>(seedLines.size());
        for (String seedLine : seedLines) {
            String trimmed = seedLine.trim();
            if (!trimmed.isEmpty()) {
                kept.add(trimmed);
            }
        }
        return kept.toArray(new String[kept.size()]);
    }

    /** Parses every seed as an int, naming the offending seed and file on failure. */
    private static int[] parseSeeds(String[] seedKeys, String seedFile) {
        int[] values = new int[seedKeys.length];
        for (int i = 0; i < seedKeys.length; i++) {
            try {
                values[i] = Integer.parseInt(seedKeys[i]);
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException(
                        "Invalid seed '" + seedKeys[i] + "' in seed file " + seedFile, e);
            }
        }
        return values;
    }
}