Node.js:使用 http 模块和 cheerio 模块实现小爬虫

2021-06-14 06:03

阅读:477

标签:class   load   text   blank   cti   rda   rar   course   课程   

 

代码:

 1 var http = require("http");
 2 
 3 var cheerio = require("cheerio");
 4 
 5 
 6 var url = ‘http://www.imooc.com/learn/348‘;
 7 
 8 
 9 http.get(url, function(res){
10     var html = ‘‘;
11 
12     res.on(‘data‘, function(data){
13         html += data;
14     });
15 
16     res.on(‘end‘, function(){
17         var courseData = filterChapters(html);
18 
19         printCourseInfo(courseData);
20         console.log(courseData);
21     });
22 }).on(‘error‘, function(){
23     console.log("获取课程数据出错!");
24 });
25 
/**
 * Extract the chapter/video outline from the HTML of a course page.
 *
 * Each element matching `.chapter` becomes one entry; its title is the text
 * of `h3 strong`, and its videos are the `li` children of `.video` that
 * contain a `.J-media-item` link.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *     One object per chapter, in document order. `id` is the part of the
 *     link's href after `video/` (undefined if the href has no such part).
 */
function filterChapters(html)
{
    var $ = cheerio.load(html);

    var courseData = [];

    // All chapters
    $('.chapter').each(function(chapterIndex, chapterElement) {
        var chapter = $(chapterElement);

        var chapterData = {
            chapterTitle: chapter.find('h3 strong').text().replace(/\r|\n/g, "").trim(),
            videos: []
        };

        chapter.find(".video").children('li').each(function(videoIndex, videoElement) {
            var video = $(videoElement).find('.J-media-item');
            var href = video.attr('href');

            // A list item without a media link has no href; skip it instead of
            // throwing on `undefined.split`.
            if (typeof href !== 'string') {
                return;
            }

            chapterData.videos.push({
                title: video.text().replace(/\r|\n/g, "").trim(),
                id: href.split('video/')[1]
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
}
61 
62 
/**
 * Print the course outline to the console.
 *
 * For every chapter, logs its title followed by a newline, then one line per
 * video in the form `   [<id>]<title>`.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>} courseData
 *     Chapter list in the shape returned by filterChapters.
 */
function printCourseInfo(courseData)
{
    courseData.forEach(function(item) {
        console.log(item.chapterTitle + '\n');

        // Tolerate a chapter object that carries no videos array.
        (item.videos || []).forEach(function(video) {
            console.log('   [' + video.id + ']' + video.title);
        });
    });
}

 

运行:

技术分享

 

 

 

----------------------------------------------------------------------

参考链接:

  • http小爬虫
  • Node.js的学习--使用cheerio抓取网页数据

 

Node.js:使用 http 模块和 cheerio 模块实现小爬虫

标签:class   load   text   blank   cti   rda   rar   course   课程   

原文地址:http://www.cnblogs.com/cbza/p/7281367.html


评论


亲,登录后才可以留言!