实例讲解ASP实现抓取网上房产信息

2018-09-05 23:52

阅读:1135

  <%@LANGUAGE=VBSCRIPTCODEPAGE=936%>
<!--#includefile=conn.ASP-->
<!--#includefile=inc/function.asp-->
<!DOCTYPEHTMLPUBLIC-//W3C//DTDHTML4.01Transitional//EN
<html>
<head>
<title>UntitledDocument</title>
<metahttp-equiv=Content-Typecontent=text/html;charset=gb2312>
<metahttp-equiv=refreshcontent=300;URL=steal_house.asp>
</head>

<body>
<%
onerrorresumenext

Server.ScriptTimeout=999999
========================================================
字符编码函数
====================================================
FunctionBytesToBstr(body,code)
dimobjstream
setobjstream=Server.createObject(adodb.stream)
objstream.Type=1
objstream.Mode=3
objstream.Open
objstream.Writebody
objstream.Position=0
objstream.Type=2
objstream.Charset=code
BytesToBstr=objstream.ReadText
objstream.Close
setobjstream=nothing
EndFunction

取行字符串在另一字符串中的出现位置
FunctionNewstring(wstr,strng)
Newstring=Instr(lcase(wstr),lcase(strng))
ifNewstring<=0thenNewstring=Len(wstr)
EndFunction
替换字符串函数
functionReplaceStr(ori,str1,str2)
ReplaceStr=replace(ori,str1,str2)
endfunction
====================================================
functionReadXML(url,code,start,ends)
setoSend=createobject(Microsoft.XMLHTTP)
SourceCode=oSend.open(GET,url,false)
oSend.send()
ReadXml=BytesToBstr(oSend.responseBody,code)
start=Instr(ReadXml,start)
ReadXml=mid(ReadXml,start)
ends=Instr(ReadXml,ends)
ReadXml=left(ReadXml,ends-1)
endfunction

functionSubStr(body,start,ends)
start=Instr(body,start)
SubStr=mid(body,start+len(start)+1)
ends=Instr(SubStr,ends)
SubStr=left(SubStr,ends-1)
endfunction

dimgetcont,NewsContent
dimurl,title
url=新闻网址
getcont=ReadXml(url,gb2312,<tableclass=k2border=0,</table>)
getcont=RegexHtml(getcont)
dimKeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra

dimContactMan,Contact
fori=2toubound(getcont)
response.Write(getcont(i)&__<br>)

tempLink=mid(getcont(i),instr(getcont(i),href=)+6,instr(getcont(i),onClick)-10)
tempLink=replace(tempLink,../,)

response.Write(i&:&tempLink&<br>)
NewsContent=ReadXml(tempLink,gb2312,<tdvalign=bottomwidth=400>,<hrwidth=760noshadesize=1color=#808080>)
NewsContent=RemoveHtml(NewsContent)
NewsContent=replace(NewsContent,VbCrLf,)
NewsContent=replace(NewsContent,vbNewLine,)
NewsContent=replace(NewsContent,,)
NewsContent=replace(NewsContent,,)
NewsContent=replace(NewsContent,,)
NewsContent=replace(NewsContent,\n,)
NewsContent=replace(NewsContent,chr(10),)
NewsContent=replace(NewsContent,chr(13),)
===============getContent=======================
response.Write(NewsContent)
KeyId=SubStr(NewsContent,列号:,信息类别:)
NewsClass=SubStr(NewsContent,类别:,所在城市:)
City=SubStr(NewsContent,城市:,房屋具体位置:)
Position=SubStr(NewsContent,位置:,房屋类型:)
HouseType=SubStr(NewsContent,类型:,楼层:)
Level=SubStr(NewsContent,楼层:,使用面积:)
Area=SubStr(NewsContent,面积:,房价:)
Price=SubStr(NewsContent,房价:,其他说明:)
Demostra=SubStr(NewsContent,说明:,联系人:)
ContactMan=SubStr(NewsContent,联系人:,联系方式:)
Contact=SubStr(NewsContent,联系方式:,信息来源:)
response.Write(总序列号:&KeyId&<br>)
response.Write(信息类别:&NewsClass&<br>)
response.Write(所在城市:&City&<br>)
response.Write(房屋具体位置:&Position&<br>)
response.Write(房屋类型:&HouseType&<br>)
response.Write(楼层:&Level&<br>)
response.Write(使用面积:&Area&<br>)
response.Write(房价:&Price&<br>)
response.Write(其他说明:&Demostra&<br>)
response.Write(联系人:&ContactMan&<br>)
response.Write(联系方式:&Contact&<br>)
title=RemoveHTML(aa(i))
response.Write(title:&title)
if(application.Contents(n)=KeyId)then
ifexit=true
endif
next
ifnotifexitthen
application(time&i)=KeyId
添加到数据库
====================================================
setrs=server.createObject(adodb.recordset)
rs.openselecttop1*fromnewsorderbyiddesc,conn,3,3
rs.addnew
rs(NewsClass)=NewsClass
rs(City)=City
rs(Position)=Position
rs(HouseType)=HouseType
rs(Level)=Level
rs(Area)=Area
rs(Price)=Price
rs(Demostra)=Demostra
rs(ContactMan)=ContactMan
rs(Contact)=Contact
rs.update
rs.close
setrs=nothing
endif
==================================================

next
functionRemoveTag(body)

SetregEx=NewRegExp
regEx.Pattern=<[a].*?<\/[a]>
regEx.IgnoreCase=True
regEx.Global=True
SetMatches=regEx.Execute(body)
dimi,arr(15),ifexit
i=0
j=0
ForEachMatchinMatches
TempStr=Match.Value
TempStr=replace(TempStr,<td>,)
TempStr=replace(TempStr,</td>,)
TempStr=replace(TempStr,<tr>,)
TempStr=replace(TempStr,</tr>,)
arr(i)=TempStr
i=i+1
if(i>=15)then
exitfor
endif
Next
SetregEx=nothing
SetMatches=nothing
RemoveTag=arr

endfunction
functionRegexHtml(body)
dimr_arr(47),r_temp
SetregEx2=NewRegExp
regEx2.Pattern=<a.*?<\/a>
regEx2.IgnoreCase=True
regEx2.Global=True
SetMatches2=regEx2.Execute(body)
iii=0
ForEachMatchinMatches2

r_arr(iii)=Match.Value

iii=iii+1
Next
RegexHtml=r_arr
setregEx2=nothing
setMatches2=nothing
endfunction
======================================================

conn.close
setconn=nothing
%>
</body>
</html>




function.asp

<%
**************************************************
函数名:gotTopic
作用:截字符串,汉字一个算两个字符,英文算一个字符
参数:str----原字符串
strlen----截取长度
返回值:截取后的字符串
**************************************************
functiongotTopic(str,strlen)
ifstr=then
gotTopic=
exitfunction
endif
diml,t,c,i
str=replace(replace(replace(replace(str,,),",chr(34)),>,>),<,<)
str=replace(str,?,)
l=len(str)
t=0
fori=1tol
c=Abs(Asc(Mid(str,i,1)))
ifc>255then
t=t+2
else
t=t+1
endif
ift>=strlenthen
gotTopic=left(str,i)&…
exitfor
else
gotTopic=str
endif
next
gotTopic=replace(replace(replace(replace(gotTopic,,),chr(34),"),>,>),<,<)
endfunction
=========================================================
函数:RemoveHTML(strHTML)
功能:去除HTML标记
参数:strHTML--要去除HTML标记的字符串
=========================================================
FunctionRemoveHTML(strHTML)
DimobjRegExp,Match,Matches
SetobjRegExp=NewRegexp

objRegExp.IgnoreCase=True
objRegExp.Global=True
取闭合的<>
objRegExp.Pattern=<.+?>
进行匹配
SetMatches=objRegExp.Execute(strHTML)

遍历匹配集合,并替换掉匹配的项目
ForEachMatchinMatches
strHtml=Replace(strHTML,Match.Value,)
Next
RemoveHTML=strHTML
SetobjRegExp=Nothing
setMatches=nothing
EndFunction

%>



conn.asp

<%
onerrorresumenext
setconn=server.createObject(adodb.connection)
con=driver={MicrosoftAccessDriver(*.mdb)};dbq=&Server.MapPath(stest.mdb)
conn.opencon

subconnclose
conn.close
setconn=nothing
endsub
%>


评论


亲,登录后才可以留言!