Java 之 WebMagic 网络爬虫
2020-12-13 01:53
数据库表SQL:
-- Table that stores one row per crawled blog post.
-- String literals use ASCII single quotes; typographic quotes are not valid
-- string delimiters in MySQL.
CREATE TABLE `Boke` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title` varchar(255) DEFAULT NULL COMMENT '标题',
  `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
  `author` varchar(255) DEFAULT NULL COMMENT '作者',
  `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主页',
  `summary` varchar(1000) DEFAULT NULL COMMENT '简介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4890 DEFAULT CHARSET=utf8;
数据库连接工具类:
import java.sql.DriverManager;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
/**
 * JDBC helper that opens connections to the MySQL database used by the crawler.
 *
 * <p>Driver class, URL and credentials are fixed constants of this class.
 */
public class MySqlJdbcUtils {

    /** Fully qualified name of the JDBC driver class to load. */
    private static final String DRIVER = "com.mysql.jdbc.Driver";

    /** JDBC URL of the target database. */
    private static final String URL =
            "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";

    /** Database user name. */
    private static final String USER = "tradingbp";

    /** Database password. */
    private static final String PASSWORD = "123456";

    /**
     * Loads the driver and opens a new connection.
     *
     * @return the opened connection, or {@code null} when the driver class is
     *         missing or the connection attempt fails (the stack trace is
     *         printed in both cases)
     */
    public static Connection getOpenConnection() {
        try {
            // Load the driver class before asking DriverManager for a connection.
            Class.forName(DRIVER);
            Connection opened = (Connection) DriverManager.getConnection(URL, USER, PASSWORD);
            System.out.println("获得数据库链接");
            return opened;
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Manual smoke test: tries to open a connection once.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        getOpenConnection();
    }
}
import java.sql.DriverManager; import java.sql.SQLException; import com.mysql.jdbc.Connection; public class MySqlJdbcUtils { private static String driver = "com.mysql.jdbc.Driver"; private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8"; private static String name="tradingbp"; private static String pwd="123456"; /** * * 获取链接 * * @date 2017年8月31日 * @return */ public static Connection getOpenConnection(){ Connection conn= null; try { //加载驱动 Class.forName(driver); conn=(Connection) DriverManager.getConnection(url, name, pwd); System.out.println("获得数据库链接"); } catch (ClassNotFoundException e) { e.printStackTrace(); }catch (SQLException e) { e.printStackTrace(); } return conn; } public static void main(String[] args) { getOpenConnection(); } }
实体类:
/**
 * Mutable bean describing one Java blog post.
 *
 * <p>Every property is a plain {@code String} and stays {@code null} until its
 * setter is called.
 *
 * @date 2017-08-24
 */
public class JavaBokeModel {

    /** Post title. */
    private String title;

    /** Address of the post body. */
    private String linke;

    /** Author name. */
    private String author;

    /** Address of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the address of the post body */
    public String getLinke() {
        return this.linke;
    }

    /** @param linke the address of the post body */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author name */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the address of the author's home page */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the address of the author's home page */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the short summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the short summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }
}
使用 WebMagic 框架爬取数据并保存:
/*
 * NOTE(review): this listing lost its line breaks and part of its text, so it
 * is not compilable as shown.
 *  - The code sits on two physical lines, so every "//" comment runs to the
 *    end of its line and hides the code written after it.
 *  - Text is missing between "for (int i = 2; i" and "htmls =page.getHtml()".
 *    The gap presumably held the rest of urls(), the seletDocumentText and
 *    seletDocumentLink helpers that are called further down, and the header
 *    of the method that receives "page" - TODO restore from the original
 *    project.
 *  - The parameter "ListjavaBokes" and the bare List and ArrayList types look
 *    like stripped generic arguments - confirm.
 *  - The XPath literal wraps post_item in typographic quotes - confirm that
 *    the original used ASCII quotes.
 *
 * What the visible code does:
 *  - getConnection() assigns the static conn from
 *    MySqlJdbcUtils.getOpenConnection() when conn is null and returns it.
 *  - insertDb(...) prepares insertSql on the static conn, stores the statement
 *    in the static ps, binds title, linke, author, authorUrl and summary of
 *    each element and calls executeUpdate() once per element; a SQLException
 *    is printed and not rethrown.
 *  - getSite() returns Site.me() with setSleepTime(1000) and setRetryTimes(10).
 *  - main(...) calls getConnection(), runs the spider from
 *    http://www.cnblogs.com/cate/java/ with thread(5), closes ps and conn
 *    while ignoring any exception from close, and prints the elapsed seconds.
 */
import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.Date; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import com.mysql.jdbc.Connection; import com.nio.webmagic.jdbc.MySqlJdbcUtils; import com.nio.webmagic.model.JavaBokeModel; /** * * Crawler * * @version [VCES V201R001, 2017-10-12] * * @see method implementation PageProcessor * @since [product/module version] */ public class JavaBoKePageProcessor implements PageProcessor { private static Connection conn=null; private static PreparedStatement ps =null; //title and link selector private static String TITLEQUERY="div.post_item_body h3 a.titlelnk"; //author private static String AUTHORQUERY="div.post_item_foot a.lightblue "; //summary private static String SUMMARYQUERY="div.post_item_body p.post_item_summary"; //insert SQL statement private static String insertSql ="INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)"; //initial connection private static Connection getConnection(){ if (conn==null) { conn = MySqlJdbcUtils.getOpenConnection(); } return conn; } /** * * insert operation * * @date 2017-08-31 * @return */ private synchronized void insertDb(ListjavaBokes){ try { ps = conn.prepareStatement(insertSql); for (JavaBokeModel javaBoke:javaBokes) { ps.setString(1, javaBoke.getTitle().toString()); ps.setString(2, javaBoke.getLinke().toString()); ps.setString(3, javaBoke.getAuthor().toString()); ps.setString(4, javaBoke.getAuthorUrl().toString()); ps.setString(5, javaBoke.getSummary().toString()); ps.executeUpdate(); } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //initialize the page URLs to crawl private static List urls(){ List listUrl =new ArrayList (); for (int i = 2; i htmls =page.getHtml().xpath("//div[@class=‘post_item‘]/html()").all(); List javaBokes=new ArrayList (); for (String html:htmls) { JavaBokeModel 
javaBoke =new JavaBokeModel(); //title and link String title =seletDocumentText(html,TITLEQUERY); String linke =seletDocumentLink(html,TITLEQUERY); //author and author home page String author=seletDocumentText(html, AUTHORQUERY); String authorUrl=seletDocumentLink(html, AUTHORQUERY); //summary String summary=seletDocumentText(html, SUMMARYQUERY); javaBoke.setTitle(title); javaBoke.setAuthor(author); javaBoke.setAuthorUrl(authorUrl); javaBoke.setLinke(linke); javaBoke.setSummary(summary); javaBokes.add(javaBoke); } insertDb(javaBokes); } @Override public Site getSite() { //crawl site configuration, including: encoding, retry count, crawl interval return Site.me().setSleepTime(1000).setRetryTimes(10); } public static void main(String[] args) { long startTime ,endTime; System.out.println("========小爬虫【启动】喽!========="); getConnection(); startTime = new Date().getTime(); //entry point Spider create = Spider.create(new JavaBoKePageProcessor()); //define the entry URL create.addUrl("http://www.cnblogs.com/cate/java/").thread(5).run(); try { ps.close(); conn.close(); } catch (Exception e) { // TODO: handle exception } endTime = new Date().getTime(); System.out.println("========小爬虫【结束】喽!========="); System.out.println("用时为:"+(endTime-startTime)/1000+"s"); } }
数据: