A simple example of using the Abot crawler in .NET
2021-06-26 18:03
Abot is an open-source .NET crawler framework: fast, easy to use, and easy to extend. The project lives at https://code.google.com/p/abot/. To parse the crawled HTML we use AngleSharp: https://github.com/AngleSharp/AngleSharp. First we configure Abot; the crawler then exposes four main events: page crawl starting, page crawl completed, page crawl disallowed, and page links crawl disallowed. Handlers for the first two are shown below, followed by a quick test from an MVC action.
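The snippets below assume the Abot and AngleSharp NuGet packages are installed, plus roughly these using directives (the namespace layout follows the Abot 1.x / AngleSharp 0.9-era packages and is an assumption; adjust to your versions):

using System;
using System.Text;
using Abot.Crawler; // PoliteWebCrawler, IWebCrawler, crawl event args
using Abot.Poco; // CrawlConfiguration, CrawlDecision, PageToCrawl, CrawledPage
using AngleSharp.Dom; // IElement
using AngleSharp.Extensions; // Text() extension method in AngleSharp 0.9.x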
private static readonly Uri FeedUrl = new Uri("https://www.jd.com/allSort.aspx"); // the URL to crawl; JD.com's product category page is the example
public static IWebCrawler GetManuallyConfiguredWebCrawler()
{
// crawl configuration; see the Abot source for the full meaning of each setting
CrawlConfiguration config = new CrawlConfiguration();
config.MaxConcurrentThreads = System.Environment.ProcessorCount; // one crawl thread per core
config.MaxPagesToCrawl = 1000;
config.IsExternalPageCrawlingEnabled = false; // stay on the start domain
config.IsUriRecrawlingEnabled = false;
config.IsExternalPageLinksCrawlingEnabled = false;
config.IsRespectRobotsDotTextEnabled = false;
config.DownloadableContentTypes = "text/html, text/plain";
config.MinCrawlDelayPerDomainMilliSeconds = 1000; // be polite: at least 1s between requests to the same domain
config.CrawlTimeoutSeconds = 0; // 0 = no overall crawl timeout
config.MaxPagesToCrawlPerDomain = 0; // 0 = no per-domain page limit
var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null); // nulls fall back to Abot's default implementations
// decision delegates consulted before a page is crawled, downloaded, or has its links followed (sketched after this method)
crawler.ShouldCrawlPage(ShouldCrawlPage);
crawler.ShouldDownloadPageContent(ShouldDownloadPageContent);
crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);
// subscribe the handlers defined below
crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync;
return crawler;
}
// fired as each single page's crawl is starting
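The three decision delegates registered above are not defined in the post. A minimal permissive sketch, matching the Func<..., CrawlContext, CrawlDecision> shape that Abot's ShouldCrawlPage / ShouldDownloadPageContent / ShouldCrawlPageLinks expect:

// Permissive decision delegates: allow everything. A real crawler would
// filter here by URL pattern, depth, or content type.
public static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
return new CrawlDecision { Allow = true };
}
public static CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
return new CrawlDecision { Allow = true };
}
public static CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
return new CrawlDecision { Allow = true };
}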
public static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
PageToCrawl pageToCrawl = e.PageToCrawl; // the page about to be fetched; log or inspect it here
}
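Only two of the four events get handlers in this example. For the other two (subscribed via crawler.PageCrawlDisallowedAsync and crawler.PageLinksCrawlDisallowedAsync), a minimal sketch using the DisallowedReason carried by Abot's event args:

// Sketches for the two "disallowed" events mentioned above.
public static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
Console.WriteLine("Did not crawl page {0}: {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}
public static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
Console.WriteLine("Did not crawl the links on page {0}: {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}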
// fired when a single page's crawl has completed
public static void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
{
if (e.CrawledPage.Uri == FeedUrl)
{
StringBuilder sb = new StringBuilder();
// parse the returned HTML with AngleSharp
var all = e.CrawledPage.AngleSharpHtmlDocument.QuerySelector(".category-items").Children;
foreach (var col in all)
{
var categories = col.QuerySelectorAll(".category-item");
foreach (var category in categories)
{
var first = category.QuerySelector(".item-title span").Text(); // first-level category name
sb.Append("\r\n" + first + "\r\n");
var seconds = category.QuerySelector(".items").Children;
foreach (var second in seconds)
{
var secondText = second.QuerySelector("dt a").Text(); // second-level category name
sb.Append(secondText + "\t");
var thirds = second.QuerySelector("dd").Children;
foreach (var third in thirds)
{
var thirdText = third.Text(); // third-level category name
sb.Append(thirdText + ",");
}
sb.Remove(sb.Length - 1, 1); // drop the trailing comma
sb.Append("\r\n"); // newline after each second-level row
}
}
}
//The crawled data is appended to fake.txt; a relative path resolves to the process working directory (under IIS Express, C:\Program Files (x86)\IIS Express), so you may need to run VS as administrator to write there.
System.IO.File.AppendAllText("fake.txt", sb.ToString());
}
}
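Writing to a relative path depends on the process working directory, as the comment above notes. A sketch of an alternative for the AppendAllText call, anchoring the file to the application's base directory (AppDomain.CurrentDomain.BaseDirectory is standard .NET; "fake.txt" is the same file name as above):

// Anchor the output file to the app's base directory instead of the
// process working directory.
var path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "fake.txt");
System.IO.File.AppendAllText(path, sb.ToString());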
// Finally, trigger the crawl from an MVC controller action:
public ActionResult Index()
{
var crawler = GetManuallyConfiguredWebCrawler();
var result = crawler.Crawl(FeedUrl); // synchronous: blocks until the crawl finishes
Response.Write(result.ErrorException);
return View();
}
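Crawl() returns a CrawlResult that carries more than the exception. A hedged variant of the reporting inside Index(), using the ErrorOccurred and RootUri members that Abot's README documents on CrawlResult:

// More explicit result reporting than Response.Write(result.ErrorException):
var result = crawler.Crawl(FeedUrl);
if (result.ErrorOccurred)
Response.Write(string.Format("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message));
else
Response.Write(string.Format("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri));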