Java 之 WebMagic 网络爬虫

2020-12-13 01:53

阅读:645

   数据库表SQL:

技术图片
-- One row per crawled blog post.
-- The COMMENT values use plain ASCII single quotes; the typographic quotes
-- in the published listing are not valid MySQL string delimiters.
CREATE TABLE `Boke` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title` varchar(255) DEFAULT NULL COMMENT '标题',
  `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
  `author` varchar(255) DEFAULT NULL COMMENT '作者',
  `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主页',
  `summary` varchar(1000) DEFAULT NULL COMMENT '简介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4890 DEFAULT CHARSET=utf8;
技术图片

 

 

 数据库连接工具类:

import java.sql.DriverManager;
import java.sql.SQLException;

import com.mysql.jdbc.Connection;

/**
 * JDBC helper that opens connections to the crawler's MySQL database.
 *
 * <p>NOTE(review): the JDBC URL, user name and password are hard-coded
 * below; move them to external configuration before using this outside a
 * demo environment.
 */
public class MySqlJdbcUtils {

    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
    private static final String NAME = "tradingbp";
    private static final String PWD = "123456";

    /**
     * Loads the JDBC driver and opens a new connection.
     *
     * <p>Failures are reported by printing the stack trace; they are not
     * rethrown, so callers must check the result for {@code null}.
     *
     * @return the open connection, or {@code null} when the driver class
     *         cannot be loaded or the connection attempt fails
     */
    public static Connection getOpenConnection() {
        Connection conn = null;
        try {
            // Load the driver class before asking DriverManager for a connection.
            Class.forName(DRIVER);
            // NOTE(review): the cast assumes the driver hands back a
            // com.mysql.jdbc.Connection — confirm for the driver version in use.
            conn = (Connection) DriverManager.getConnection(URL, NAME, PWD);
            System.out.println("获得数据库链接");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Smoke test: opens a connection and closes it again so the check does
     * not leave a connection behind.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Connection conn = getOpenConnection();
        if (conn == null) {
            return;
        }
        try {
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

}


技术图片
import java.sql.DriverManager;
import java.sql.SQLException;

import com.mysql.jdbc.Connection;

/**
 * JDBC helper that opens connections to the crawler's MySQL database.
 *
 * <p>NOTE(review): the JDBC URL, user name and password are hard-coded
 * below; move them to external configuration before using this outside a
 * demo environment.
 */
public class MySqlJdbcUtils {

    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
    private static final String NAME = "tradingbp";
    private static final String PWD = "123456";

    /**
     * Loads the JDBC driver and opens a new connection.
     *
     * <p>Failures are reported by printing the stack trace; they are not
     * rethrown, so callers must check the result for {@code null}.
     *
     * @return the open connection, or {@code null} when the driver class
     *         cannot be loaded or the connection attempt fails
     */
    public static Connection getOpenConnection() {
        Connection conn = null;
        try {
            // Load the driver class before asking DriverManager for a connection.
            Class.forName(DRIVER);
            // NOTE(review): the cast assumes the driver hands back a
            // com.mysql.jdbc.Connection — confirm for the driver version in use.
            conn = (Connection) DriverManager.getConnection(URL, NAME, PWD);
            System.out.println("获得数据库链接");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Smoke test: opens a connection and closes it again so the check does
     * not leave a connection behind.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Connection conn = getOpenConnection();
        if (conn == null) {
            return;
        }
        try {
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

}
技术图片

 

 实体类:

/**
 * Plain data holder for one crawled Java blog post.
 *
 * <p>Every property is a nullable {@code String} with a getter and a setter;
 * nothing is validated or transformed.
 *
 * @date 2017-08-24
 */
public class JavaBokeModel {

    /** Post title. */
    private String title;

    /** Address of the post body. */
    private String linke;

    /** Author name. */
    private String author;

    /** Address of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the address of the post body */
    public String getLinke() {
        return this.linke;
    }

    /** @param linke the address of the post body */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author name */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the address of the author's home page */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the address of the author's home page */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the short summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the short summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

}

webmagic 框架爬取数据并保存

   

技术图片
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import com.mysql.jdbc.Connection;
import com.nio.webmagic.jdbc.MySqlJdbcUtils;
import com.nio.webmagic.model.JavaBokeModel;
/**
 * 
 * Crawler: extracts blog posts from crawled pages and stores them in the
 * Boke table.
 *
 * @version  [VCES V201R001, 2017-10-12]
 *
 * @see PageProcessor the interface this class implements
 * @since  [product/module version]
 */
public class JavaBoKePageProcessor implements PageProcessor {
    // Shared database connection; assigned on first use by getConnection() and closed in main()
    private static Connection conn=null;
    // Statement assigned by insertDb() and closed in main()
    private static PreparedStatement ps =null;
    // Selector used to obtain the title and the link of a post
    private static String  TITLEQUERY="div.post_item_body h3 a.titlelnk";
    // Selector for the author
    private static String AUTHORQUERY="div.post_item_foot a.lightblue ";
    // Selector for the summary
    private static String SUMMARYQUERY="div.post_item_body p.post_item_summary";
    // Parameterized INSERT statement; one placeholder per Boke column written
    private static String insertSql ="INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)";
    
    /**
     * Returns the shared database connection, opening it through
     * {@code MySqlJdbcUtils.getOpenConnection()} the first time it is requested.
     *
     * @return the cached connection; whatever the utility returned on first use
     */
    private static Connection getConnection(){
        if (conn != null) {
            return conn;
        }
        conn = MySqlJdbcUtils.getOpenConnection();
        return conn;
    }
    
    /**
     * Inserts every given post into the Boke table, one row per model.
     *
     * <p>The shared statement is prepared on the first call and reused
     * afterwards, so a new statement is not left unclosed for every page;
     * {@code main} closes it once the crawl has finished. SQL failures are
     * reported by printing the stack trace and the remaining posts of the
     * batch are skipped.
     *
     * <p>NOTE(review): the method synchronizes on the processor instance
     * while the statement is static — this assumes a single processor
     * instance is handed to the spider, as {@code main} does.
     *
     * @date   2017-08-31
     * @param javaBokes posts extracted from one page
     */
    private synchronized void insertDb(List<JavaBokeModel> javaBokes){
        // Go through the lazy accessor instead of the raw field so a missing
        // connection is detected here rather than as a NullPointerException.
        Connection connection = getConnection();
        if (connection == null) {
            System.err.println("No database connection available; skipping insert of " + javaBokes.size() + " post(s)");
            return;
        }
        try {
            if (ps == null) {
                ps = connection.prepareStatement(insertSql);
            }
            for (JavaBokeModel javaBoke : javaBokes) {
                // setString accepts null and stores SQL NULL; the columns allow it.
                ps.setString(1, javaBoke.getTitle());
                ps.setString(2, javaBoke.getLinke());
                ps.setString(3, javaBoke.getAuthor());
                ps.setString(4, javaBoke.getAuthorUrl());
                ps.setString(5, javaBoke.getSummary());
                ps.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
    
    // Initializes the list of page addresses to crawl.
    private static List urls(){
        List listUrl =new ArrayList();
        
        // NOTE(review): the next line is truncated. The text between `i` and
        // `htmls` is missing (it looks as if everything from a `<` up to a later
        // `>` was stripped), so the rest of this loop, the end of urls() and the
        // header of the method that the code below belongs to are not visible
        // here. Restore them from the original source before compiling.
        // NOTE(review): the quotes around post_item inside the XPath literal are
        // typographic characters — presumably they should be ASCII single
        // quotes; verify against the original source.
        for (int i = 2; i  htmls =page.getHtml().xpath("//div[@class=‘post_item‘]/html()").all();
        // Collects one model per extracted HTML fragment.
        List javaBokes=new ArrayList();
        for (String html:htmls) {
            JavaBokeModel javaBoke =new JavaBokeModel();
            // Title and link
            String title =seletDocumentText(html,TITLEQUERY);
            String linke =seletDocumentLink(html,TITLEQUERY);
            // Author and author home page
            String author=seletDocumentText(html, AUTHORQUERY);
            String authorUrl=seletDocumentLink(html, AUTHORQUERY);
            // Summary
            String summary=seletDocumentText(html, SUMMARYQUERY);
            javaBoke.setTitle(title);
            javaBoke.setAuthor(author);
            javaBoke.setAuthorUrl(authorUrl);
            javaBoke.setLinke(linke);
            javaBoke.setSummary(summary);
            javaBokes.add(javaBoke);
            
        }
        // Write the posts collected from this page to the database.
        insertDb(javaBokes);
        
    }

    /**
     * Supplies the crawl settings for the target site.
     *
     * @return a site configured with a sleep time of 1000 (presumably
     *         milliseconds between requests) and 10 retry times
     */
    @Override
    public Site getSite() {
        Site site = Site.me().setSleepTime(1000);
        return site.setRetryTimes(10);
    }
    
    /**
     * Entry point: opens the database connection, runs the spider with five
     * threads starting from the Java category page, then releases the JDBC
     * resources and prints the elapsed time in seconds.
     *
     * <p>The statement and the connection are closed independently of each
     * other and even when the crawl throws, so a statement that was never
     * prepared no longer prevents the connection from being closed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("========小爬虫【启动】喽!=========");
        getConnection();
        long startTime = System.currentTimeMillis();
        try {
            // Entry URL of the crawl; run() blocks until the spider has finished.
            Spider.create(new JavaBoKePageProcessor())
                    .addUrl("http://www.cnblogs.com/cate/java/")
                    .thread(5)
                    .run();
        } finally {
            if (ps != null) {
                try {
                    ps.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                try {
                    conn.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("========小爬虫【结束】喽!=========");
        System.out.println("用时为:"+(endTime-startTime)/1000+"s");
    }

}
技术图片

 

数据:技术图片


评论


亲,登录后才可以留言!