Java tutorial
/* * Copyright (C) 2015 hu * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package cn.edu.hfut.dmic.webcollector.example; import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler; import cn.edu.hfut.dmic.webcollector.example.util.JDBCHelper; import cn.edu.hfut.dmic.webcollector.model.Links; import cn.edu.hfut.dmic.webcollector.model.Page; import cn.edu.hfut.dmic.webcollector.net.Proxys; import org.jsoup.nodes.Document; import org.springframework.jdbc.core.JdbcTemplate; /** * WebCollector 2.xtutorial * 2.x * 1?????????AJAX * 2Berkeley DB?URL??? * 3?selenium?javascript??? * 4???? * 5?spring jdbcmysql connection?? * 6?json? * 7slf4j? * 8http?http * * ?cn.edu.hfut.dmic.webcollector.example?(Demo) * * @author hu */ public class TutorialCrawler2 extends BreadthCrawler { /** * ???????visit * @param page * @param nextLinks ???URLautoParsetrue??nextLinks */ @Override public void visit(Page page, Links nextLinks) { Document doc = page.getDoc(); String title = doc.title(); System.out.println("URL:" + page.getUrl() + " :" + title); /*??mysql*/ if (jdbcTemplate != null) { int updates = jdbcTemplate.update("insert into tb_content (title,url,html) value(?,?,?)", title, page.getUrl(), page.getHtml()); if (updates == 1) { System.out.println("mysql??"); } } /* //nextLinksx?URL?????URL //????? //?????Crawler.addForcedSeedURL nextLinks.add("http://www.csdn.net"); */ } JdbcTemplate jdbcTemplate = null; /*autoParsetrue??????????*/ public TutorialCrawler2(String crawlPath, boolean autoParse) { super(crawlPath, autoParse); /*BreadthCrawler?URL*/ this.addRegex("http://.*zhihu.com/.*"); this.addRegex("-.*jpg.*"); /*JdbcTemplate,"mysql1"???? JDBCHelper.getJdbcTemplate("mysql1")?? ????URL???????? JdbcTemplate??? JdbcTemplate(?JDBCHelper.getJdbcTemplate("??") ??JdbcTemplate) */ try { jdbcTemplate = JDBCHelper.createMysqlTemplate("mysql1", "jdbc:mysql://localhost/testdb?useUnicode=true&characterEncoding=utf8", "root", "password", 5, 30); /*?*/ jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS tb_content (" + "id int(11) NOT NULL AUTO_INCREMENT," + "title varchar(50),url varchar(200),html longtext," + "PRIMARY KEY (id)" + ") ENGINE=MyISAM DEFAULT CHARSET=utf8;"); System.out.println("?? tb_content"); } catch (Exception ex) { jdbcTemplate = null; System.out.println("mysql?JDBCHelper.createMysqlTemplate???!"); } } public static void main(String[] args) throws Exception { /* ?crawlPathcrawlPathURL?????crawlPath ????? */ TutorialCrawler2 crawler = new TutorialCrawler2("/home/hu/data/wb", true); crawler.setThreads(50); crawler.addSeed("http://www.zhihu.com/"); crawler.setResumable(false); /* //requester??http??requester?http/socks? HttpRequesterImpl requester=(HttpRequesterImpl) crawler.getHttpRequester(); //?? requester.setProxy("127.0.0.1", 1080,Proxy.Type.SOCKS); //?? RandomProxyGenerator proxyGenerator=new RandomProxyGenerator(); proxyGenerator.addProxy("127.0.0.1",8080,Proxy.Type.SOCKS); requester.setProxyGenerator(proxyGenerator); */ /*??*/ crawler.setResumable(false); /*???URL?*/ crawler.setTopN(100); /*????URL?*/ crawler.start(5); } }