记一次nodejs 爬虫(利用递归循环nightmare)
2021-04-02 17:25
标签:win tor UNC query orm 验证 clear set firefox
新手,欢迎交流。
这里的网址很有规律,方便我们获取图书列表的 url。
nightmare 几个主要的 api(具体参考:http://www.manongjc.com/detail/8-roxmpabfhewimht.html):
.goto(url,options)
.wait(callback[selector]):这个方法会重复调用,直到回调 return true;也可以传入选择器,如 .wait('body'),意指等待 body 加载完成。
.inject(type, file):注入本地文件,type 可为 css 或 js。
.evaluate(callback):这里的返回值可以在 .then() 中获得处理结果。
.catch():捕获抛出的错误。
.end():关闭浏览器。注意关闭后调用不了 goto 方法;如果需要循环请求 url,不能调用此方法。
note:运行结果保存在数据库中。
note:原本打算爬取 100 页的书籍列表,中途出现验证码。
信息比较多,选取我们需要的。
好在查看详情已经在服务器渲染好了,不需要我们模拟点击。
运行数据库结果:
记一次nodejs 爬虫(利用递归循环nightmare)
标签:win tor UNC query orm 验证 clear set firefox
原文地址:https://www.cnblogs.com/juvenileLin/p/12552131.html
记一次nodejs 爬虫(利用递归循环nightmare)
目标网站
.goto(url, options) 的参数:url:目标网站;options:伪造头部信息。
.wait(callback) 的回调会被重复调用,直到它 return true。
.evaluate(callback) 的返回值可以在 .then 方法中接收到。
.wait 和 .evaluate 中的代码要在浏览器中运行。
爬取书籍列表
// Script setup for the book-list crawler: load dependencies, create the shared
// Nightmare browser instance and open the MySQL connection used by funcasy().
// All string literals use straight quotes; the typographic quotes in the
// scraped listing are not valid JavaScript string delimiters.
var Nightmare = require('nightmare');
// show: true opens a visible browser window while crawling.
var nightmare = Nightmare({ show: true });
var fs = require('fs');
var mysql = require('mysql');
var async = require('async');

// Connection settings for the local MySQL server.
// NOTE(review): credentials are hard-coded here; consider reading them from
// the environment before using this outside a local experiment.
let options = {
  host: 'localhost',
  port: '3306',
  user: 'root',
  password: 'root',
  database: 'juveniledata'
};

var connection = mysql.createConnection(options);

// Open the connection eagerly and report the outcome. A failure is only
// logged; the script keeps running, as in the original listing.
connection.connect(function (err) {
  if (err) {
    console.log(err);
  } else {
    console.log('database连接成功');
  }
});
// Loads one page of the book listing in the shared Nightmare browser and
// starts collecting the detail-page links found on it.
//   i          - value interpolated into the listing URL (the `w${i}` segment).
//   connection - MySQL connection; not used in the visible part of the body.
// NOTE: this listing is cut off partway through the final loop, so the rest
// of the function (including any use of `connection`) is not visible here.
function funcasy(i,connection){
nightmare
// Second argument to goto is an object of request headers.
.goto(`http://item.kongfz.com/Cxiaoshuo/tag_k34k33k30k30k35w${i}/`,{
‘User-Agent‘: ‘Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1_1 like Mac OS X; en) AppleWebKit/534.46.0 (KHTML, like Gecko) CriOS/19.0.1084.60 Mobile/9B206 Safari/7534.48.3‘,
‘Accept‘: ‘text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3‘,
‘Accept-Language‘: ‘zh-CN,zh;q=0.9‘,
‘Cache-Control‘: ‘no-cache‘
})
// Returns true only once an element with class "result-list" exists.
.wait(function(){
if(document.getElementsByClassName(‘result-list‘)[0]) return true;
})
// Create the page-global array that will hold the collected values.
.wait(function(){
window.lyharr = [];
return true;
})
// Select the first anchor inside each ".item-img" under "#listBox".
.wait(function(){
var gethreflyh = document.querySelectorAll(‘#listBox .item-img a:first-of-type‘);
for(let j=0;j
根据书籍列表的url,获取具体的书籍信息
// Script setup for the book-detail crawler: load dependencies, create the
// shared Nightmare browser instance and open the MySQL connection used by
// getbookinfo(). All string literals use straight quotes; the typographic
// quotes in the scraped listing are not valid JavaScript string delimiters.
var Nightmare = require('nightmare');
// show: true opens a visible browser window while crawling.
var nightmare = Nightmare({ show: true });
var fs = require('fs');
var mysql = require('mysql');
var async = require('async');

// Connection settings for the local MySQL server.
// NOTE(review): credentials are hard-coded here; consider reading them from
// the environment before using this outside a local experiment.
let options = {
  host: 'localhost',
  port: '3306',
  user: 'root',
  password: 'root',
  database: 'juveniledata'
};

var connection = mysql.createConnection(options);

// Open the connection eagerly and report the outcome. A failure is only
// logged; the script keeps running, as in the original listing.
connection.connect(function (err) {
  if (err) {
    console.log(err);
  } else {
    console.log('database连接成功');
  }
});
// Looks up the detail-page URL stored under id `i` in table bookurl2, opens it
// in the shared Nightmare browser, scrapes eleven fields from the page and
// inserts them as one row into table bookinfo2.
//   i          - row id used in the bookurl2 lookup.
//   connection - MySQL connection used for both the lookup and the insert.
// NOTE: this listing is cut off inside the insert callback (at `if(i`), so
// whatever follows the insert is not visible here.
function getbookinfo(i,connection){
// NOTE(review): the SQL text is built by string interpolation; consider
// placeholder parameters instead.
let sqlstr = `select bookurl2 from bookurl2 where id = "${i}" `;
connection.query(sqlstr,function(err,results,fields){
// NOTE(review): an error is only logged; execution continues to results[0]
// on the next line regardless.
if(err) console.log(err);
console.log(results[0].bookurl2);
nightmare
// Second argument to goto is an object of request headers.
// NOTE(review): "application/jason", "X-Request-With" and
// "urlencode:chartset" look like typos for "application/json",
// "X-Requested-With" and "x-www-form-urlencoded; charset" — confirm.
.goto(results[0].bookurl2,{
"Accept" : "application/jason, text/javascript, */*; q = 0.01",
"X-Request-With" : "XMLHttpRequest",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; ...) Gecko/20100101 Firefox/60.0",
"Content-Type" : "application/x-www-form-urlencode:chartset=UTF-8"
})
// Create the page-global array that will hold the scraped fields.
.wait(function(){
window.lyharr = [];
return true;
})
// Inject a local jQuery file; the `$` calls below are made after this step.
.inject(‘js‘, ‘jquery-3.4.1.min.js‘)
// Wait for the detail container selector to match.
.wait(‘.detail-con .clearfix‘)
// Push the fields in a fixed order; the insert below reads them by index 0-10.
.wait(function(){
window.lyharr.push($(‘h1[class="detail-title"]‘).text());// book title
// Each labelled field reads the text of the element following the <span>
// whose text contains the label. Author: runs of spaces are stripped.
window.lyharr.push(($(‘span:contains("作者")‘).next()).text().replace(/\ +/g,""));
// Publisher
window.lyharr.push(($(‘span:contains("出版社")‘).next()).text());
// Publication date
window.lyharr.push(($(‘span:contains("出版时间")‘).next()).text());
// ISBN
window.lyharr.push(($(‘span:contains("ISBN")‘).next()).text());
// List price
window.lyharr.push(($(‘span:contains("定价")‘).next()).text());
// Page count
window.lyharr.push(($(‘span:contains("页数")‘).next()).text());
// Category
window.lyharr.push(($(‘span:contains("分类")‘).next()).text());
// Content summary (runs of spaces stripped)
window.lyharr.push($(‘li:contains("内容简介")‘).text().replace(/\ +/g,""));
// Author biography (runs of spaces stripped)
window.lyharr.push($(‘li:contains("作者简介")‘).text().replace(/\ +/g,""));
// Image URL: href of the first anchor inside the detail-img div
window.lyharr.push($(‘div[class="detail-img"] a:first-of-type‘).attr(‘href‘));
return true;
})
// Hand the collected array back to the Node side.
.evaluate(function(){
return window.lyharr;
})
.then(function(results){
// console.log(results);
// NOTE(review): scraped page text is interpolated directly into the SQL
// string; a double quote in any field would break the statement. Consider
// placeholder parameters instead.
let sqlsql = `insert into bookinfo2 values(null,"${results[0]}","${results[1]}","${results[2]}","${results[3]}","${results[4]}","${results[5]}","${results[6]}","${results[7]}","${results[8]}","${results[9]}","${results[10]}")`;
connection.query(sqlsql,function(err,results,fields){
// The success message below is logged even when err is set.
if(err)console.log(err);
console.log(‘database sucess‘);
if(i
切记:循环时,不能调用 .end() 方法
文章标题:记一次nodejs 爬虫(利用递归循环nightmare)
文章链接:http://soscw.com/essay/71470.html