List of usage examples for org.apache.mahout.math.stats.LogLikelihood#rootLogLikelihoodRatio
public static double rootLogLikelihoodRatio(long k11, long k12, long k21, long k22)
From source file: com.mapr.synth.CommonPointOfCompromiseTest.java
License:Apache License
@Test public void testCompromise() throws IOException, ParseException { SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = df.parse("2014-01-01 00:00:00").getTime(); SchemaSampler s = new SchemaSampler( Resources.asCharSource(Resources.getResource("schema013.json"), Charsets.UTF_8).read()); long exploitStart = df.parse("2014-01-20 00:00:00").getTime(); long exploitEnd = df.parse("2014-02-20 00:00:00").getTime(); int exploitStartDay = (int) TimeUnit.DAYS.convert(exploitStart - start, TimeUnit.MILLISECONDS); int[] transactionsByDay = new int[DAYS_COUNTED]; int[] compromiseByDay = new int[DAYS_COUNTED]; int[] fraudByDay = new int[DAYS_COUNTED]; Multiset<Integer> fraudUserCounts = HashMultiset.create(); Multiset<Integer> nonfraudUserCounts = HashMultiset.create(); Multiset<Integer> allMerchantCounts = HashMultiset.create(); int fraudAccounts = 0; Set<Integer> merchantHistory = Sets.newHashSet(); // these collect the evolution of the contingency table for just merchant 0 and are indexed by time relative to exploit window. int exploitLength = (int) (TimeUnit.DAYS.convert(exploitEnd - exploitStart, TimeUnit.MILLISECONDS)) + 1; // exploitLength = 5; int[] atmTotal = new int[exploitLength]; int[] atmFraud = new int[exploitLength]; int[] atmNonFraud = new int[exploitLength]; int[] nonAtmFraud = new int[exploitLength]; int[] nonAtmNonFraud = new int[exploitLength]; for (int userId = 0; userId < USER_COUNT; userId++) { JsonNode sample = s.sample();//from ww w. j av a2 s . 
c o m merchantHistory.clear(); boolean userHasFraud = false; int[] hasFraudPerUser = new int[exploitLength]; for (JsonNode record : sample.get("history")) { long timestamp = record.get("timestamp").asLong() * 1000; int day = (int) ((timestamp - start) / TimeUnit.MILLISECONDS.convert(1, TimeUnit.DAYS)); if (day >= DAYS_COUNTED || day >= exploitStartDay + exploitLength) { break; } if (record.get("compromise").asInt() > 0) { compromiseByDay[day]++; } boolean isFraud = record.get("fraud").asInt() > 0; if (isFraud) { fraudByDay[day]++; } transactionsByDay[day]++; // only record history up to the beginning of the exploit window int merchant = record.get("merchant").asInt(); if (timestamp < exploitStart) { merchantHistory.add(merchant); } // only consider fraud indicators during the exploit window if (timestamp >= exploitStart && timestamp <= exploitEnd) { // any fraud in the window marks the user if (isFraud) { // first time we see fraud indication in exploit window, we set flags for the rest of the window if (!userHasFraud) { int eday = day - exploitStartDay; for (int i = eday; i < exploitLength; i++) { hasFraudPerUser[i] = 1; } } userHasFraud = true; } } } // we collect flags for each day and then only count this user once. Necessary because multiple // transactions can occur on each day and we don't want to count all of them. int atmInHistory = merchantHistory.contains(0) ? 
1 : 0; for (int day = 0; day < exploitLength; day++) { atmTotal[day] += atmInHistory; atmFraud[day] += atmInHistory * hasFraudPerUser[day]; atmNonFraud[day] += atmInHistory * (1 - hasFraudPerUser[day]); nonAtmFraud[day] += (1 - atmInHistory) * hasFraudPerUser[day]; nonAtmNonFraud[day] += (1 - atmInHistory) * (1 - hasFraudPerUser[day]); } if (userHasFraud) { fraudAccounts++; for (Integer merchant : merchantHistory) { fraudUserCounts.add(merchant); allMerchantCounts.add(merchant); } } else { for (Integer merchant : merchantHistory) { nonfraudUserCounts.add(merchant); allMerchantCounts.add(merchant); } } } int k1 = fraudAccounts; int k2 = USER_COUNT - k1; try (PrintStream out = new PrintStream(new FileOutputStream("scores.tsv"))) { out.printf("merchant\tk11\tk12\tk21\tk22\tk.1\tscore\n"); for (Integer merchant : allMerchantCounts.elementSet()) { int k11 = fraudUserCounts.count(merchant); int k12 = k1 - k11; int k21 = nonfraudUserCounts.count(merchant); int k22 = k2 - k21; out.printf("%d\t%d\t%d\t%d\t%d\t%d\t%.1f\n", merchant, k11, k12, k21, k22, allMerchantCounts.count(merchant), LogLikelihood.rootLogLikelihoodRatio(k11, k12, k21, k22)); } } try (PrintStream out = new PrintStream(new FileOutputStream("counts.tsv"))) { out.printf("day\tcompromises\tfrauds\ttransactions\n"); for (int i = 0; i < compromiseByDay.length; i++) { out.printf("%d\t%d\t%d\t%d\n", i, compromiseByDay[i], fraudByDay[i], transactionsByDay[i]); } } try (PrintStream out = new PrintStream(new FileOutputStream("growth.tsv"))) { out.printf("day\tatm.total\tk11\tk12\tk21\tk22\tscore\n"); for (int i = 0; i < exploitLength; i++) { int k11 = atmFraud[i]; int k12 = nonAtmFraud[i]; int k21 = atmNonFraud[i]; int k22 = nonAtmNonFraud[i]; out.printf("%d\t%d\t%d\t%d\t%d\t%d\t%.1f\n", i, atmTotal[i], k11, k12, k21, k22, LogLikelihood.rootLogLikelihoodRatio(k11, k12, k21, k22)); } } }
From source file: com.mapr.synth.TermGeneratorTest.java
License:Apache License
@Test public void distinctVocabularies() { TermGenerator x1 = new TermGenerator(WORDS, 1, 0.8); final Multiset<String> k1 = HashMultiset.create(); for (int i = 0; i < 50000; i++) { k1.add(x1.sample());/*from www . ja v a 2 s. co m*/ } TermGenerator x2 = new TermGenerator(WORDS, 1, 0.8); final Multiset<String> k2 = HashMultiset.create(); for (int i = 0; i < 50000; i++) { k2.add(x2.sample()); } final NormalDistribution normal = new NormalDistribution(); List<Double> scores = Ordering.natural() .sortedCopy(Iterables.transform(k1.elementSet(), new Function<String, Double>() { public Double apply(String s) { return normal.cumulativeProbability(LogLikelihood.rootLogLikelihoodRatio(k1.count(s), 50000 - k1.count(s), k2.count(s), 50000 - k2.count(s))); } })); int n = scores.size(); // System.out.printf("%.5f, %.5f, %.5f, %.5f, %.5f, %.5f, %.5f", scores.get(0), scores.get((int) (0.05*n)), scores.get(n / 4), scores.get(n / 2), scores.get(3 * n / 4), scores.get((int) (0.95 * n)), scores.get(n - 1)); int i = 0; for (Double score : scores) { if (i % 10 == 0) { System.out.printf("%.6f\t%.6f\n", (double) i / n, score); } i++; } }