org.apache.kylin.measure.topn.TopNCounterTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.kylin.measure.topn.TopNCounterTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.kylin.measure.topn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.math3.distribution.ZipfDistribution;
import org.apache.kylin.common.util.Pair;
import org.junit.Ignore;
import org.junit.Test;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

@Ignore("For collecting accuracy statistics, not for functional test")
public class TopNCounterTest {

    /** Number of top entries requested from each consumer when the results are compared. */
    protected static int TOP_K;

    /** Number of distinct random keys generated for the test data set. */
    protected static int KEY_SPACE;

    /** Number of lines (one key per line) written to the generated data file. */
    protected static int TOTAL_RECORDS;

    /** Factor applied to TOP_K to size each space-saving counter (capacity = TOP_K * SPACE_SAVING_ROOM). */
    protected static int SPACE_SAVING_ROOM;

    /** Number of counters the data set is split across in the parallel test. */
    protected static int PARALLEL = 10;

    /** When true, progress and comparison messages are printed to standard output. */
    protected static boolean verbose = true;

    public TopNCounterTest() {
        TOP_K = 100;
        KEY_SPACE = 100 * TOP_K;
        TOTAL_RECORDS = 1000000; // 1 million
        SPACE_SAVING_ROOM = 100;
    }

    /**
     * Generates the test data file: TOTAL_RECORDS lines, each holding one of KEY_SPACE random
     * 10-letter keys. The key written on each line is picked by sampling a Zipf distribution
     * (exponent 0.5) over the key indexes.
     *
     * The method name is a historical misspelling of "prepareTestData"; it is kept because the
     * method is protected and may be called from subclasses.
     *
     * @return absolute path of the generated temporary file; the caller is expected to delete it
     * @throws IOException if the temporary file cannot be created or written
     */
    protected String prepareTestDate() throws IOException {
        String[] allKeys = new String[KEY_SPACE];
        for (int i = 0; i < KEY_SPACE; i++) {
            allKeys[i] = RandomStringUtils.randomAlphabetic(10);
        }

        outputMsg("Start to create test random data...");
        long startTime = System.currentTimeMillis();
        ZipfDistribution zipf = new ZipfDistribution(KEY_SPACE, 0.5);

        // createTempFile atomically creates a new, uniquely named empty file. Write straight into
        // it instead of deleting and re-creating it, which would give up that uniqueness guarantee.
        File tempFile = File.createTempFile("ZipfDistribution", ".txt");
        FileWriter fw = new FileWriter(tempFile);
        try {
            for (int i = 0; i < TOTAL_RECORDS; i++) {
                // samples are shifted down by one to be used as 0-based array indexes
                int keyIndex = zipf.sample() - 1;
                fw.write(allKeys[keyIndex]);
                fw.write('\n');
            }
        } finally {
            fw.close();
        }

        outputMsg("Create test data takes : " + (System.currentTimeMillis() - startTime) / 1000 + " seconds.");
        outputMsg("Test data in : " + tempFile.getAbsolutePath());

        return tempFile.getAbsolutePath();
    }

    /**
     * Feeds the whole generated data set to one space-saving counter and to an exact hash-map
     * counter, then compares their top TOP_K counts.
     *
     * @throws IOException if the data file cannot be created or read
     */
    //@Test
    public void testSingleSpaceSaving() throws IOException {
        String dataFile = prepareTestDate();
        TopNCounterTest.SpaceSavingConsumer spaceSavingCounter = new TopNCounterTest.SpaceSavingConsumer(
                TOP_K * SPACE_SAVING_ROOM);
        TopNCounterTest.HashMapConsumer accurateCounter = new TopNCounterTest.HashMapConsumer();

        try {
            feedDataToConsumer(dataFile, spaceSavingCounter, 0, TOTAL_RECORDS);
            feedDataToConsumer(dataFile, accurateCounter, 0, TOTAL_RECORDS);
        } finally {
            // always remove the temp file, even when feeding fails; a cleanup problem must not
            // mask the original exception, hence the quiet delete
            FileUtils.deleteQuietly(new File(dataFile));
        }

        compareResult(spaceSavingCounter, accurateCounter);
    }

    /**
     * Compares the top TOP_K entries of two consumers position by position and fails the test
     * when the lists differ in length or any pair of counts is not close (see isClose).
     *
     * Only the counts are compared, not the keys: keys whose counts are nearly equal may appear
     * in a different order in the two rankings.
     *
     * @param firstConsumer  consumer under test (logged as "Space saving")
     * @param secondConsumer reference consumer whose counts are treated as correct
     */
    private void compareResult(TopNCounterTest.TestDataConsumer firstConsumer,
            TopNCounterTest.TestDataConsumer secondConsumer) {
        List<Pair<String, Double>> topResult1 = firstConsumer.getTopN(TOP_K);
        outputMsg("Get topN, Space saving takes " + firstConsumer.getSpentTime() / 1000 + " seconds");
        List<Pair<String, Double>> realSequence = secondConsumer.getTopN(TOP_K);
        outputMsg("Get topN, Merge sort takes " + secondConsumer.getSpentTime() / 1000 + " seconds");

        // Without this check a truncated (or empty) first result would be compared against only a
        // prefix of the reference list and could pass vacuously.
        org.junit.Assert.assertEquals("Both consumers must return the same number of top entries",
                realSequence.size(), topResult1.size());

        int error = 0;
        for (int i = 0; i < topResult1.size(); i++) {
            Pair<String, Double> approximate = topResult1.get(i);
            Pair<String, Double> exact = realSequence.get(i);
            outputMsg("Compare " + i);

            if (isClose(approximate.getSecond().doubleValue(), exact.getSecond().doubleValue())) {
                outputMsg("Passed; key:" + approximate.getFirst() + ", value:" + approximate.getSecond());
            } else {
                outputMsg("Failed; space saving key:" + approximate.getFirst() + ", value:"
                        + approximate.getSecond());
                outputMsg("Failed; correct key:" + exact.getFirst() + ", value:" + exact.getSecond());
                error++;
            }
        }

        org.junit.Assert.assertEquals(0, error);
    }

    /**
     * Tells whether two counts are near enough to be treated as matching.
     *
     * @return true when the absolute difference of the two values is below 5.0
     */
    private boolean isClose(double value1, double value2) {
        final double gap = Math.abs(value1 - value2);
        return gap < 5.0;
    }

    /**
     * Splits the generated data set into PARALLEL consecutive slices, feeds each slice to its own
     * space-saving counter, merges the counters with singleMerge and compares the merged top
     * TOP_K counts with those of an exact hash-map counter fed with the whole data set.
     *
     * @throws IOException            if the data file cannot be created or read
     * @throws ClassNotFoundException declared by the merge helper
     */
    @Test
    public void testParallelSpaceSaving() throws IOException, ClassNotFoundException {
        String dataFile = prepareTestDate();

        TopNCounterTest.SpaceSavingConsumer[] parallelCounters = new TopNCounterTest.SpaceSavingConsumer[PARALLEL];
        for (int i = 0; i < PARALLEL; i++) {
            parallelCounters[i] = new TopNCounterTest.SpaceSavingConsumer(TOP_K * SPACE_SAVING_ROOM);
        }
        TopNCounterTest.HashMapConsumer accurateCounter = new TopNCounterTest.HashMapConsumer();

        try {
            int slice = TOTAL_RECORDS / PARALLEL;
            int startPosition = 0;
            for (int i = 0; i < PARALLEL; i++) {
                // The last counter also takes the remainder when TOTAL_RECORDS is not a multiple
                // of PARALLEL; otherwise those records would be counted by the exact counter only.
                int endPosition = (i == PARALLEL - 1) ? TOTAL_RECORDS : startPosition + slice;
                feedDataToConsumer(dataFile, parallelCounters[i], startPosition, endPosition);
                startPosition = endPosition;
            }

            feedDataToConsumer(dataFile, accurateCounter, 0, TOTAL_RECORDS);
        } finally {
            // always remove the temp file, including when feeding fails; the comparison below
            // no longer needs it
            FileUtils.deleteQuietly(new File(dataFile));
        }

        TopNCounterTest.SpaceSavingConsumer[] mergedCounters = singleMerge(parallelCounters);
        compareResult(mergedCounters[0], accurateCounter);
    }

    /**
     * Merges all given counters, one after another, into a single new counter and then retains
     * TOP_K * SPACE_SAVING_ROOM entries in it.
     *
     * @param consumers counters to merge; they are passed to merge() as arguments only
     * @return the input array itself when it holds exactly one counter, otherwise a one-element
     *         array holding the merged counter
     */
    private TopNCounterTest.SpaceSavingConsumer[] singleMerge(TopNCounterTest.SpaceSavingConsumer[] consumers)
            throws IOException, ClassNotFoundException {
        if (consumers.length == 1) {
            return consumers;
        }

        final int capacity = TOP_K * SPACE_SAVING_ROOM;
        TopNCounterTest.SpaceSavingConsumer merged = new TopNCounterTest.SpaceSavingConsumer(capacity);

        for (TopNCounterTest.SpaceSavingConsumer consumer : consumers) {
            merged.vs.merge(consumer.vs);
        }

        merged.vs.retain(capacity); // remove extra elements
        return new TopNCounterTest.SpaceSavingConsumer[] { merged };
    }

    /**
     * Merges the counters pairwise, round after round, until at most one is left. In each round
     * the counter at every even index absorbs its right-hand neighbour (when there is one), so
     * the counters in the input array are modified in place.
     *
     * Not called by any test visible in this file; kept as an alternative to singleMerge.
     *
     * @param consumers counters to merge
     * @return the input array itself when it holds zero or one counter, otherwise a one-element
     *         array holding the counter that absorbed all the others
     */
    private TopNCounterTest.SpaceSavingConsumer[] binaryMerge(TopNCounterTest.SpaceSavingConsumer[] consumers)
            throws IOException, ClassNotFoundException {
        // "<= 1" rather than "== 1": an empty array would otherwise recurse forever,
        // because each round of an empty input produces another empty input.
        if (consumers.length <= 1) {
            return consumers;
        }

        List<TopNCounterTest.SpaceSavingConsumer> list = Lists.newArrayList();
        for (int i = 0, n = consumers.length; i < n; i = i + 2) {
            if (i + 1 < n) {
                consumers[i].vs.merge(consumers[i + 1].vs);
            }

            list.add(consumers[i]);
        }

        return binaryMerge(list.toArray(new TopNCounterTest.SpaceSavingConsumer[list.size()]));
    }

    /**
     * Reads the data file and passes every line whose 0-based line number lies in
     * [startLine, endLine) to the consumer with a value of 1.0, then logs the elapsed time.
     *
     * @param dataFile  path of the data file, one key per line
     * @param consumer  receiver of the selected lines
     * @param startLine first line number to feed (inclusive, 0-based)
     * @param endLine   line number at which feeding stops (exclusive)
     * @throws IOException if the file cannot be opened or read
     */
    private void feedDataToConsumer(String dataFile, TopNCounterTest.TestDataConsumer consumer, int startLine,
            int endLine) throws IOException {
        long startTime = System.currentTimeMillis();
        BufferedReader bufferedReader = new BufferedReader(new FileReader(dataFile));

        try {
            int lineNum = 0;
            String line;
            // Stop reading at endLine: no later line can belong to the requested range.
            while (lineNum < endLine && (line = bufferedReader.readLine()) != null) {
                if (lineNum >= startLine) {
                    consumer.addElement(line, 1.0);
                }
                lineNum++;
            }
        } finally {
            // close the reader even when reading or the consumer throws
            bufferedReader.close();
        }

        outputMsg("feed data to " + consumer.getClass().getCanonicalName() + " take time (seconds): "
                + (System.currentTimeMillis() - startTime) / 1000);
    }

    /** Prints the message to standard output, but only while the verbose flag is set. */
    private void outputMsg(String msg) {
        if (!verbose) {
            return;
        }
        System.out.println(msg);
    }

    private static interface TestDataConsumer {
        public void addElement(String elementKey, double value);

        public List<Pair<String, Double>> getTopN(int k);

        public long getSpentTime();
    }

    /**
     * Consumer backed by a TopNCounter; it also accumulates the wall-clock time spent in the
     * counter's offer and topK calls.
     */
    private class SpaceSavingConsumer implements TopNCounterTest.TestDataConsumer {
        private long timeSpent = 0;
        /** Underlying counter; accessed directly by the merge helpers of the enclosing test. */
        protected TopNCounter<String> vs;

        /**
         * @param space capacity handed to the underlying TopNCounter
         */
        public SpaceSavingConsumer(int space) {
            vs = new TopNCounter<String>(space);
        }

        /** Offers the key with the given value to the counter and records the time taken. */
        @Override
        public void addElement(String key, double value) {
            final long begin = System.currentTimeMillis();
            vs.offer(key, value);
            timeSpent += System.currentTimeMillis() - begin;
        }

        /** Converts the counter's top-k entries into (item, count) pairs, in the counter's order. */
        @Override
        public List<Pair<String, Double>> getTopN(int k) {
            final long begin = System.currentTimeMillis();
            final List<Counter<String>> tops = vs.topK(k);
            final List<Pair<String, Double>> result = Lists.newArrayList();

            for (Counter<String> entry : tops) {
                result.add(Pair.newPair(entry.getItem(), entry.getCount()));
            }

            timeSpent += System.currentTimeMillis() - begin;
            return result;
        }

        /** Returns the accumulated time in milliseconds. */
        @Override
        public long getSpentTime() {
            return timeSpent;
        }
    }

    /**
     * Exact reference consumer: keeps the running total of every key in a hash map and sorts
     * all entries when the top entries are requested. It also accumulates the wall-clock time
     * spent doing so.
     */
    private class HashMapConsumer implements TopNCounterTest.TestDataConsumer {

        private long timeSpent = 0;
        private Map<String, Double> hashMap;

        public HashMapConsumer() {
            hashMap = Maps.newHashMap();
        }

        /** Adds the value to the total stored for the key, starting from the value itself for a new key. */
        @Override
        public void addElement(String key, double value) {
            long startTime = System.currentTimeMillis();
            // one lookup instead of containsKey + get; stored values are never null,
            // so a null result means the key has not been seen yet
            Double current = hashMap.get(key);
            hashMap.put(key, current == null ? value : current + value);
            timeSpent += (System.currentTimeMillis() - startTime);
        }

        /**
         * Returns the k entries with the highest totals, highest first; when fewer than k keys
         * have been seen, all of them are returned.
         */
        @Override
        public List<Pair<String, Double>> getTopN(int k) {
            long startTime = System.currentTimeMillis();
            List<Pair<String, Double>> allRecords = Lists.newArrayList();

            for (Map.Entry<String, Double> entry : hashMap.entrySet()) {
                allRecords.add(Pair.newPair(entry.getKey(), entry.getValue()));
            }

            Collections.sort(allRecords, new Comparator<Pair<String, Double>>() {
                @Override
                public int compare(Pair<String, Double> o1, Pair<String, Double> o2) {
                    // arguments swapped to sort in descending order of total
                    return Double.compare(o2.getSecond(), o1.getSecond());
                }
            });
            timeSpent += (System.currentTimeMillis() - startTime);
            // clamp the upper bound: subList(0, k) throws when fewer than k keys exist
            return allRecords.subList(0, Math.min(k, allRecords.size()));
        }

        /** Returns the accumulated time in milliseconds. */
        @Override
        public long getSpentTime() {
            return timeSpent;
        }
    }

}