Asp 使用 Microsoft.XMLHTTP 抓取网页内容并过滤需要的

2018-09-06 11:45

阅读：502

　　Asp 使用 Microsoft.XMLHTTP 抓取网页内容(没用乱码)，并过滤需要的内容

示例源码：
复制代码代码如下:
<%
Dim xmlUrl,http,strHTML,strBody
xmlUrl = Request.QueryString(u)

REM 异步读取XML源
Set http = server.CreateObject(Microsoft.XMLHTTP)
http.Open POST,xmlUrl,false
http.setrequestheader User-Agent, Mozilla/4.0
http.setrequestheader Connection, Keep-Alive
http.setRequestHeader Content-Type, application/x-
http.Send()

strHTML = BytesToBstr(http.ResponseBody)
set http = nothing

REM 抓取主要内容
strBody = GetBody(strHTML,<div id=Div_newsContentc class=cnt>,</div>,0,0)
strBody =Replace(strBody,（本文首发于,)
strBody =Replace(strBody,财富动力网</a>，转载请注明出处。）,)
strBody =Replace(strBody,本文首发于，转载请注明出处。）,)
strBody =Replace(strBody,财富动力网</a>:
strBody =Replace(strBody,本文首发于,)

Response.Write RegRemoveHref(strBody)

REM 获取对应网址响应的HTML
Function BytesToBstr(body)
dim objstream
set objstream = Server.CreateObject(adodb.stream)
objstream.Type = 1
objstream.Mode =3
objstream.Open
objstream.Write body
objstream.Position = 0
objstream.Type = 2
objstream.Charset = UTF-8

转换原来默认的UTF-8编码转换成GB2312编码，否则直接用
XMLHTTP调用有中文字符的网页得到的将是乱码
BytesToBstr = objstream.ReadText
objstream.Close
set objstream = nothing
End Function

REM 使用正则表达式，抓取之内标记的内容
Function GetBody(ConStr,StartStr,OverStr,IncluL,IncluR)
If ConStr=$False$ or ConStr= or IsNull(ConStr)=True Or StartStr= or IsNull(StartStr)=True Or OverStr= or IsNull(OverStr)=True Then
GetBody=$False$
Exit Function
End If
Dim ConStrTemp
Dim Start,Over
ConStrTemp=Lcase(ConStr)
StartStr=Lcase(StartStr)
OverStr=Lcase(OverStr)
Start = InStrB(1, ConStrTemp, StartStr, vbBinaryCompare)
If Start<=0 then
GetBody=$False$
Exit Function
Else
If IncluL=False Then
Start=Start+LenB(StartStr)
End If
End If
Over=InStrB(Start,ConStrTemp,OverStr,vbBinaryCompare)
If Over<=0 Or Over<=Start then
GetBody=$False$
Exit Function
Else
If IncluR=True Then
Over=Over+LenB(OverStr)
End If
End If
GetBody=MidB(ConStr,Start,Over-Start)
End Function

REM 过滤a超链接
Function RegRemoveHref(HTMLstr)
Set ra = New RegExp
ra.IgnoreCase = True
ra.Global = True
ra.Pattern = <a[^>]+>(.+?)<\/a>

RegRemoveHref = Replace(ra.replace(HTMLstr,$1),href=
END Function
%>

效果图如下：