cn.edu.hfut.dmic.webcollector.example.TutorialCrawler2.java Source code

Java tutorial

Introduction

Here is the source code for cn.edu.hfut.dmic.webcollector.example.TutorialCrawler2.java

Source

/*
 * Copyright (C) 2015 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.example.util.JDBCHelper;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.Proxys;
import org.jsoup.nodes.Document;
import org.springframework.jdbc.core.JdbcTemplate;

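/**
 * Tutorial for WebCollector 2.x.
 * Features of the 2.x line:
 *   1) Custom traversal strategies for more complex crawling tasks, e.g. pagination and AJAX
 *   2) Built-in Berkeley DB for managing URLs, allowing larger crawls
 *   3) selenium integration for extracting javascript-generated content
 *   4) Direct support for switching randomly between multiple proxies
 *   5) spring jdbc and mysql connection integration for persisting data
 *   6) An integrated json parser
 *   7) slf4j used as the logging facade
 *   8) A reworked http request interface that makes custom http requests easier
 * 
 * Examples (demos) can be found in the cn.edu.hfut.dmic.webcollector.example package.
 * 
 * @author hu
 */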
public class TutorialCrawler2 extends BreadthCrawler {

    /**
     * Template used to store crawled pages in MySQL. It is {@code null} when
     * the database setup in the constructor failed, in which case
     * {@link #visit(Page, Links)} skips persistence.
     */
    JdbcTemplate jdbcTemplate = null;

    /**
     * Creates the crawler, registers its URL rules and tries to prepare the
     * MySQL table that {@link #visit(Page, Links)} writes to.
     *
     * <p>A database failure is not fatal: it is reported on standard output
     * and the crawler continues without persistence.
     *
     * @param crawlPath passed through to {@link BreadthCrawler}; presumably
     *                  the directory where crawl state is kept (confirm
     *                  against the WebCollector documentation)
     * @param autoParse passed through to {@link BreadthCrawler}; presumably
     *                  enables automatic extraction of links matching the
     *                  registered regex rules (confirm)
     */
    public TutorialCrawler2(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);

        /* URL rules; NOTE(review): a leading '-' presumably marks an
           exclusion rule in WebCollector - confirm. */
        this.addRegex("http://.*zhihu.com/.*");
        this.addRegex("-.*jpg.*");

        /* Create a JdbcTemplate registered under the name "mysql1".
           The commented tutorial text suggests it can later be fetched with
           JDBCHelper.getJdbcTemplate("mysql1"). The trailing 5 and 30 are
           presumably the initial and maximum pool sizes - TODO confirm
           against JDBCHelper. */
        try {
            jdbcTemplate = JDBCHelper.createMysqlTemplate("mysql1",
                    "jdbc:mysql://localhost/testdb?useUnicode=true&characterEncoding=utf8", "root", "password", 5,
                    30);

            /* Create the target table if it does not exist yet. */
            jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS tb_content (" + "id int(11) NOT NULL AUTO_INCREMENT,"
                    + "title varchar(50),url varchar(200),html longtext," + "PRIMARY KEY (id)"
                    + ") ENGINE=MyISAM DEFAULT CHARSET=utf8;");
            System.out.println("?? tb_content");
        } catch (Exception ex) {
            /* Best effort: disable persistence, but report why setup failed
               instead of discarding the exception. */
            jdbcTemplate = null;
            System.out.println("mysql?JDBCHelper.createMysqlTemplate???!");
            System.out.println("cause: " + ex);
        }

    }

    /**
     * Handles one fetched page: prints its URL and title and, when a database
     * is available, inserts title, URL and HTML into {@code tb_content}.
     *
     * @param page      the fetched page
     * @param nextLinks links to crawl later; not modified here.
     *                  NOTE(review): with autoParse enabled WebCollector
     *                  presumably fills this with links matching the regex
     *                  rules - confirm.
     */
    @Override
    public void visit(Page page, Links nextLinks) {
        Document doc = page.getDoc();
        String title = doc.title();
        System.out.println("URL:" + page.getUrl() + "  :" + title);

        /* Store the page in MySQL when the template was set up.
           NOTE(review): the title column is varchar(50) and url is
           varchar(200); longer values may be rejected or truncated
           depending on the MySQL mode - confirm. */
        if (jdbcTemplate != null) {
            int updates = jdbcTemplate.update("insert into tb_content (title,url,html) value(?,?,?)", title,
                    page.getUrl(), page.getHtml());
            if (updates == 1) {
                System.out.println("mysql??");
            }
        }

        /*
        // Example: manually queue an additional URL for a later round.
        // The tutorial mentions Crawler.addForcedSeedURL for forcing a URL
        // that was already crawled - verify against the WebCollector API.
         nextLinks.add("http://www.csdn.net");
        */
    }

    /**
     * Runs the tutorial crawl against zhihu.com with 50 threads.
     *
     * @param args unused
     * @throws Exception if the crawler fails to start or run
     */
    public static void main(String[] args) throws Exception {
        /*
           First argument: crawlPath. Second argument: autoParse.
           See the constructor for how they are used.
        */
        TutorialCrawler2 crawler = new TutorialCrawler2("/home/hu/data/wb", true);
        crawler.setThreads(50);
        crawler.addSeed("http://www.zhihu.com/");

        /*
        // Example: configure an http/socks proxy through the requester.
        HttpRequesterImpl requester=(HttpRequesterImpl) crawler.getHttpRequester();    
            
        // Single proxy
        requester.setProxy("127.0.0.1", 1080,Proxy.Type.SOCKS);
            
        // Randomly chosen proxies
        RandomProxyGenerator proxyGenerator=new RandomProxyGenerator();
        proxyGenerator.addProxy("127.0.0.1",8080,Proxy.Type.SOCKS);
        requester.setProxyGenerator(proxyGenerator);
        */

        /* Resumable mode off. This was previously set twice; once is enough. */
        crawler.setResumable(false);
        /* NOTE(review): presumably caps the number of URLs fetched per
           crawl round - confirm against the WebCollector documentation. */
        crawler.setTopN(100);

        /* NOTE(review): the argument is presumably the crawl depth
           (number of rounds) - confirm. */
        crawler.start(5);
    }

}