com.spider.webmagic.scheduler.WechatDBDuplicateRemover.java Source code

Java tutorial

Introduction

Here is the source code for com.spider.webmagic.scheduler.WechatDBDuplicateRemover.java

Source

/*******************************************************************************
 * @(#)WechatDBDuplicateRemover.java May 10, 2016
 *
 * Copyright 2016 Hengtian Soft Group Ltd. All rights reserved.
 * Hengtian Soft PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 *******************************************************************************/
package com.spider.webmagic.scheduler;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.atomic.AtomicInteger;

import org.springframework.context.ApplicationContext;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;

/**
 * {@link DuplicateRemover} that de-duplicates requests by their title using a
 * Guava {@link BloomFilter}.
 * <p>
 * At construction time a fresh filter is handed to
 * {@code DBReadUrls.readHistoryDBTitle(...)} and the filter returned by that
 * call becomes the active one. Every later check-and-insert runs while holding
 * this object's monitor, so a check and its following insert are never
 * interleaved with another thread's.
 * <p>
 * Because a Bloom filter is probabilistic, a title may occasionally be
 * reported as a duplicate although it was never seen; the rate is governed by
 * the {@code fpp} constructor argument.
 * <p>
 * NOTE(review): titles are passed to the filter as-is; a request whose title
 * is {@code null} is not handled specially here - confirm callers always set
 * a title.
 *
 * @author <a href="mailto:ruiyuanyu@hengtiansoft.com"> ruiyuanyu</a>
 * @version $Revision 1.1 $ May 10, 2016 3:21:56 PM
 */
public class WechatDBDuplicateRemover implements DuplicateRemover {

    /**
     * Charset used to turn a title into bytes before hashing. Pinned to UTF-8
     * so hashing does not depend on the JVM's platform default: a default
     * charset that cannot encode a title's characters would map them to the
     * same replacement byte and make distinct titles look identical.
     */
    private static final Charset TITLE_CHARSET = StandardCharsets.UTF_8;

    /** Expected number of titles the filter is sized for. */
    private final int expectedInsertions;

    /** Desired false-positive probability of the filter. */
    private final double fpp;

    /** Number of requests accepted as new by {@link #isDuplicate(Request, Task)} since the last rebuild. */
    private final AtomicInteger counter = new AtomicInteger(0);

    /** Active filter; reads and writes go through methods synchronized on {@code this}. */
    private BloomFilter<CharSequence> bloomFilter;

    /**
     * Creates the remover and obtains its initial filter from
     * {@code DBReadUrls.readHistoryDBTitle(...)}.
     *
     * @param expectedInsertions expected number of titles, used to size the filter
     * @param fpp                desired false-positive probability
     * @param tablename          forwarded to {@code DBReadUrls}
     * @param mybatisContext     forwarded to {@code DBReadUrls}
     * @param type               forwarded to {@code DBReadUrls}
     */
    public WechatDBDuplicateRemover(int expectedInsertions, double fpp, String tablename,
            ApplicationContext mybatisContext, String type) {
        this.expectedInsertions = expectedInsertions;
        this.fpp = fpp;
        this.bloomFilter = new DBReadUrls(tablename, mybatisContext, type).readHistoryDBTitle(rebuildBloomFilter());
    }

    /**
     * Resets the accepted-request counter to zero and builds a new, empty
     * filter with the configured capacity and false-positive probability.
     * The new filter is only returned; installing it is up to the caller.
     *
     * @return a new empty filter
     */
    protected BloomFilter<CharSequence> rebuildBloomFilter() {
        counter.set(0);
        return BloomFilter.create(Funnels.stringFunnel(TITLE_CHARSET), expectedInsertions, fpp);
    }

    /**
     * Reports whether the request's title has (probably) been seen before and,
     * if not, records it and counts the request.
     *
     * @param request request whose title is checked
     * @param task    unused
     * @return {@code true} if the filter already might contain the title
     */
    @Override
    public synchronized boolean isDuplicate(Request request, Task task) {
        boolean duplicate = checkAndRecord(getTitle(request));
        if (!duplicate) {
            counter.incrementAndGet();
        }
        return duplicate;
    }

    /**
     * Same check-and-record as {@link #isDuplicate(Request, Task)} but without
     * touching the request counter.
     *
     * @param request request whose title is checked
     * @param task    unused
     * @return {@code true} if the filter already might contain the title
     */
    public synchronized boolean isQueueDuplicate(Request request, Task task) {
        return checkAndRecord(getTitle(request));
    }

    /**
     * Extracts the de-duplication key from a request.
     *
     * @param request the request
     * @return the request's title
     */
    protected String getTitle(Request request) {
        return request.getTitle();
    }

    /**
     * Discards everything recorded so far by installing a new empty filter and
     * zeroing the counter.
     * <p>
     * The rebuilt filter has to be assigned here: {@link #rebuildBloomFilter()}
     * only returns it, and without the assignment the old filter would keep
     * reporting earlier titles as duplicates.
     * <p>
     * NOTE(review): the new filter starts empty; titles loaded from the
     * database by the constructor are not reloaded - confirm that is the
     * intended meaning of a reset.
     *
     * @param task unused
     */
    @Override
    public synchronized void resetDuplicateCheck(Task task) {
        this.bloomFilter = rebuildBloomFilter();
    }

    /**
     * @param task unused
     * @return number of requests accepted as new since the last rebuild
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        return counter.get();
    }

    /**
     * @return the filter currently in use
     */
    public synchronized BloomFilter<CharSequence> getBloomFilter() {
        return bloomFilter;
    }

    /**
     * Replaces the filter in use; the request counter is left unchanged.
     *
     * @param bloomFilter the filter to use from now on
     */
    public synchronized void setBloomFilter(BloomFilter<CharSequence> bloomFilter) {
        this.bloomFilter = bloomFilter;
    }

    /**
     * Checks the title against the filter and inserts it when it is new.
     * Callers must hold this object's monitor.
     *
     * @param title de-duplication key
     * @return {@code true} if the filter already might contain the title
     */
    private boolean checkAndRecord(String title) {
        boolean seen = bloomFilter.mightContain(title);
        if (!seen) {
            bloomFilter.put(title);
        }
        return seen;
    }

}