com.voidsearch.voidbase.quant.feed.TokenFrequency.java Source code

Java tutorial

Introduction

Here is the source code for com.voidsearch.voidbase.quant.feed.TokenFrequency.java

Source

/*
 * Copyright 2009 VoidSearch.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.voidsearch.voidbase.quant.feed;

import com.voidsearch.voidbase.apps.queuetree.client.QueueTreeClient;
import com.voidsearch.voidbase.client.SimpleHttpClient;
import com.voidsearch.voidbase.quant.timeseries.NumericalSequence;
import com.voidsearch.voidbase.quant.timeseries.SequenceGenerator;
import org.apache.commons.lang.StringUtils;

import java.util.*;

/**
 * Tracks token frequency on underlying stream data.
 *
 * <p>Each call to {@link #next()} first publishes the current top-ranked tokens
 * (as accumulated by earlier calls) through the queue client, then fetches the
 * configured URL, tokenizes the response and updates both the cumulative
 * per-token counts and the counts for that single fetch.
 *
 * <p>The cumulative map grows with the number of distinct valid tokens seen;
 * {@code cleanupTree()} is still an empty placeholder for pruning it.
 */

public class TokenFrequency extends NumericalSequence implements SequenceGenerator {

    /** Maximum number of distinct tokens published to queues per dump. */
    private static final int MAX_TOKENS_LIMIT = 20;

    /** Tokens shorter than this are ignored. */
    private static final int MIN_TOKEN_LENGTH = 5;

    /**
     * Ranking order for publishing: highest cumulative count first; ties are
     * broken by token text so the cut-off at the limit is deterministic.
     */
    private static final Comparator<TokenEntry> BY_COUNT_DESC = new Comparator<TokenEntry>() {
        public int compare(TokenEntry a, TokenEntry b) {
            int byCount = b.getCount().compareTo(a.getCount());
            if (byCount != 0) {
                return byCount;
            }
            return a.getToken().compareTo(b.getToken());
        }
    };

    private SimpleHttpClient client;
    private String url;

    /** Cumulative count per token across all fetches. */
    private Map<String, Integer> termFreq;

    /** Count per token in the most recent fetch only. */
    private Map<String, Integer> currentFreq;

    private QueueTreeClient queueClient = new QueueTreeClient();

    // NOTE(review): this field is assigned by the two-argument constructor but is
    // not read anywhere in this class; tokenization in next() uses the default
    // StringTokenizer delimiters. Confirm the intended use before wiring it in,
    // since doing so would change how existing streams are split.
    String tokenDelimiter = "\t";

    /**
     * A token paired with a count.
     *
     * <p>Equality and hash code are based on the token only, while the natural
     * ordering is by count only, so {@code compareTo} is not consistent with
     * {@code equals}.
     */
    public class TokenEntry implements Comparable<TokenEntry> {

        String token;
        Integer count = 0;

        public TokenEntry(String token, int count) {
            this.token = token;
            this.count = count;
        }

        /** Adds one to the stored count. */
        public void increment() {
            count++;
        }

        public String getToken() {
            return token;
        }

        public Integer getCount() {
            return count;
        }

        /**
         * Two entries are equal when their tokens are equal.
         *
         * @return {@code false} for {@code null} or for objects that are not a
         *         {@code TokenEntry} (instead of throwing)
         */
        @Override
        public boolean equals(Object o1) {
            if (this == o1) {
                return true;
            }
            if (!(o1 instanceof TokenEntry)) {
                return false;
            }
            String otherToken = ((TokenEntry) o1).getToken();
            return (token == null) ? (otherToken == null) : token.equals(otherToken);
        }

        /** Orders entries by ascending count. */
        public int compareTo(TokenEntry o1) {
            return count.compareTo(o1.getCount());
        }

        /** @return the token and its count separated by a tab */
        @Override
        public String toString() {
            return token + "\t" + count;
        }

        @Override
        public int hashCode() {
            return (token == null) ? 0 : token.hashCode();
        }
    }

    /**
     * @param requestURL URL fetched on every {@link #next()} call
     */
    public TokenFrequency(String requestURL) {
        init(requestURL);
    }

    /**
     * @param requestURL     URL fetched on every {@link #next()} call
     * @param tokenDelimiter stored in {@code tokenDelimiter}; see the note on that field
     */
    public TokenFrequency(String requestURL, String tokenDelimiter) {
        this.tokenDelimiter = tokenDelimiter;
        init(requestURL);
    }

    /**
     * Sets up the counters and the HTTP client. Failures are printed and
     * otherwise ignored, as before.
     */
    private void init(String requestURL) {
        try {
            url = requestURL;
            termFreq = new HashMap<String, Integer>();
            // start with an empty map so a dump before the first fetch never sees null
            currentFreq = new HashMap<String, Integer>();
            client = new SimpleHttpClient();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Generates the next sequence element.
     *
     * <p>Publishes the top terms accumulated so far, then fetches the URL and
     * counts its valid tokens.
     *
     * @return number of valid tokens in the fetched response, or 0 if anything
     *         failed (the exception is printed, not rethrown)
     */
    public double next() {
        try {
            dumpTopTerms();
            byte[] result = client.get(url);
            // NOTE(review): decoded with the platform default charset; the
            // encoding of the remote content is not known from this class.
            StringTokenizer tokenizer = new StringTokenizer(new String(result));

            // counts for this fetch replace those of the previous one
            currentFreq = new HashMap<String, Integer>();
            int tokenCount = 0;

            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                if (!isValidToken(token)) {
                    continue;
                }
                addOne(termFreq, token);
                addOne(currentFreq, token);
                tokenCount++;
            }
            return tokenCount;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0;
    }

    /** Increments the counter stored under {@code key}, starting at 1 when absent. */
    private static void addOne(Map<String, Integer> counts, String key) {
        Integer previous = counts.get(key);
        counts.put(key, (previous == null) ? 1 : previous + 1);
    }

    /**
     * cleanup frequency tree
     * invoked periodically on next() call
     *
     * <p>Currently a no-op and not called from anywhere in this class.
     */
    private void cleanupTree() {

    }

    /**
     * Prints and publishes the highest-ranked tokens and deletes the queues of
     * all others.
     *
     * <p>The ranking is rebuilt from the cumulative counts on every call, one
     * entry per distinct token, so no per-occurrence history is retained. For
     * each of the first {@code MAX_TOKENS_LIMIT} tokens a line is printed, the
     * queue named after the token is created and the token's count from the most
     * recent fetch (0 if it did not occur) is put on it. For every remaining
     * token the queue of that name is deleted.
     *
     * @throws Exception propagated from the queue client
     */
    private void dumpTopTerms() throws Exception {

        List<TokenEntry> ranked = new ArrayList<TokenEntry>(termFreq.size());
        for (Map.Entry<String, Integer> total : termFreq.entrySet()) {
            ranked.add(new TokenEntry(total.getKey(), total.getValue()));
        }
        Collections.sort(ranked, BY_COUNT_DESC);

        int rank = 0;
        for (TokenEntry entry : ranked) {
            String token = entry.getToken();

            if (rank++ < MAX_TOKENS_LIMIT) {

                Integer latest = currentFreq.get(token);
                System.out.println(entry + "\t" + latest);

                // handle queue entry; tokens absent from the last fetch report 0
                Integer freq = (latest == null) ? Integer.valueOf(0) : latest;

                // NOTE(review): 1000 is passed through unchanged; presumably a
                // queue size - confirm against QueueTreeClient.create.
                queueClient.create(token, 1000);
                queueClient.put(token, freq.toString());

            } else {

                // cleanup non-active queues
                queueClient.delete(token);

            }
        }

    }

    /**
     * Checks whether a token should be counted.
     *
     * @param token candidate token, may be {@code null}
     * @return {@code true} only for tokens of at least {@code MIN_TOKEN_LENGTH}
     *         characters that contain no {@code <} or {@code >} and pass
     *         {@code StringUtils.isAlpha}
     */
    private boolean isValidToken(String token) {
        // filter empty strings and enforce min length
        if ((token == null) || (token.length() < MIN_TOKEN_LENGTH)) {
            return false;
        }
        // filter html tags
        if (token.contains(">") || token.contains("<")) {
            return false;
        }
        return StringUtils.isAlpha(token);
    }

}