写了段批量抓取某个列表页的东东

2018-09-06 12:38

阅读:682

  有些人当抓取程序是个宝,到目前还TND有人在卖钱,强烈BS一下这些家伙真是的!可能偶下边这段东西比较烂哈
下边这个没有写入库功能,已经到这一步了,入库功能是很简单的事了,需要的请自己去完成吧,其它功能各位自行完善吧!把代码拷贝过去直接运行即可看到效果

DimUrl,List_PageCode,Array_ArticleID,i,ArticleID
DimContent_PageCode,Content_TempCode
DimContent_CategoryID,Content_CategoryName,BorderID,ClassID,BorderName,ClassName
DimArticleTitle,ArticleAuthor,ArticleFrom,ArticleContent

Url=
List_PageCode=getHTTPPage(Url)
List_PageCode=RegExpText(List_PageCode,打印</th></tr>,</table><tableborder=0cellpadding=5,0)
List_PageCode=RegExpText(List_PageCode,<tdalign=left><ahref=../,><imgborder=0src=../images/authortype0.gif,1)取得当前列表页的文章链接,以,分隔
Array_ArticleID=Split(List_PageCode,,)创建数组,存储文章ID

Fori=0ToUbound(Array_ArticleID)-1
ArticleID=Array_ArticleID(i)文章ID
Content_PageCode=getHTTPPage(取得文章页的内容

=========取文章分类及相关ID参数开始=======================
Content_TempCode=RegExpText(Content_PageCode,<ahref=/article/>技术教程</a>>>,>>内容</td>,0)
Content_CategoryID=RegExpText(Content_PageCode,<ahref=../class,/>,1)
BorderID=Split(Content_CategoryID,,)(0)大类ID
ClassID=Split(Content_CategoryID,,)(1)子类ID
==========检查大类是否存在开始===============
如果不存在则入库

==========检查大类是否存在结束===============
Response.Write(BorderID&,&ClassID&<br/>)
Content_CategoryName=RegExpText(Content_PageCode,/>,</a>,1)
BorderName=Split(Content_CategoryName,,)(0)大类名称
ClassName=Split(Content_CategoryName,,)(1)子类名称
==========检查子类是否存在开始===============
如果不存在则入库

==========检查子类是否存在结束===============
=========取文章分类及相关ID参数结束=======================

=========取文章标题及内容开始=============================
ArticleTitle=RegExpText(Content_PageCode,<tr><tdalign=centerbgcolor=#DEE2F5><strong>,</strong></td></tr>,0)
ArticleAuthor=RegExpText(Content_PageCode,<tr><td><spanclass=blue>作者:</span>,</td></tr>,0)
ArticleFrom=RegExpText(Content_PageCode,<tr><td><spanclass=blue>来源:</span>,</td></tr>,0)
ArticleContent=RegExpText(Content_PageCode,<tr><tdclass=contentstyle=WORD-WRAP:break-wordid=zoom>,</td></tr>&VBCrlf&</table>&VBCrlf&</td></tr></table>,0)
=========取文章标题及内容结束=============================
Response.Write(ArticleTitle&<br/><br/>)
Response.Flush()
Next


附几个函数:
FunctiongetHTTPPage(url)
IF(IsObjInstalled(Microsoft.XMLHTTP)=False)THEN
Response.Write<br><br>服务器不支持Microsoft.XMLHTTP组件
Err.Clear
Response.End
ENDIF
OnErrorResumeNext
Dimhttp
SEThttp=Server.CreateObject(Msxml2.XMLHTTP)
Http.openGET,url,False
Http.send()
IF(Http.readystate<>4)THEN
ExitFunction
ENDIF
getHTTPPage=BytesToBSTR(Http.responseBody,GB2312)
SEThttp=NOTHING
IF(Err.number<>0)THEN
Response.Write<br><br>获取文件内容出错
Response.End
Err.Clear
ENDIF
EndFunction


FunctionBytesToBstr(CodeBody,CodeSet)
DimobjStream
SETobjStream=Server.CreateObject(adodb.stream)
objStream.Type=1
objStream.Mode=3
objStream.Open
objStream.WriteCodeBody
objStream.Position=0
objStream.Type=2
objStream.Charset=CodeSet
BytesToBstr=objStream.ReadText
objStream.Close
SETobjStream=NOTHING
EndFunction

================================================
作用:检查组件是否已经安装
返回值:True----已经安装
False----没有安装
================================================
FunctionIsObjInstalled(objName)
OnErrorResumeNext
IsObjInstalled=False
Err=0
DimtestObj
SETtestObj=Server.CreateObject(objName)
IF(0=Err)THENIsObjInstalled=True
SETtestObj=NOTHING
Err=0
EndFunction

FunctionRegExpText(strng,strStart,strEnd,n)
DimregEx,Match,Matches,RetStr
SETregEx=NewRegExp
regEx.Pattern=strStart&([\s\S]*?)&strEnd
regEx.IgnoreCase=True
regEx.Global=True
SETMatches=regEx.Execute(strng)
ForEachMatchinMatches
IF(n=1)THEN
RetStr=RetStr&regEx.Replace(Match.Value,$1)&,
ELSE
RetStr=RetStr&regEx.Replace(Match.Value,$1)
ENDIF
Next
RegExpText=RetStr
SETregEx=NOTHING
EndFunction


评论


亲,登录后才可以留言!