asp采集抓取网上房产信息的代码
2018-09-06 12:35
复制代码 代码如下:
<%@LANGUAGE=VBSCRIPTCODEPAGE=936%>
<!--#includefile=conn.asp-->
<!--#includefile=inc/function.asp-->
<!DOCTYPEHTMLPUBLIC-//W3C//DTDHTML4.01Transitional//EN
<html>
<head>
<title>UntitledDocument</title>
<metahttp-equiv=Content-Typecontent=text/html;charset=gb2312>
<metahttp-equiv=refreshcontent=300;URL=steal_house.asp>
</head>
<body>
<%
onerrorresumenext
Server.ScriptTimeout=999999
========================================================
字符编码函数
====================================================
FunctionBytesToBstr(body,code)
dimobjstream
setobjstream=Server.CreateObject(adodb.stream)
objstream.Type=1
objstream.Mode=3
objstream.Open
objstream.Writebody
objstream.Position=0
objstream.Type=2
objstream.Charset=code
BytesToBstr=objstream.ReadText
objstream.Close
setobjstream=nothing
EndFunction
取行字符串在另一字符串中的出现位置
FunctionNewstring(wstr,strng)
Newstring=Instr(lcase(wstr),lcase(strng))
ifNewstring<=0thenNewstring=Len(wstr)
EndFunction
替换字符串函数
functionReplaceStr(ori,str1,str2)
ReplaceStr=replace(ori,str1,str2)
endfunction
====================================================
functionReadXml(url,code,start,ends)
setoSend=createobject(Microsoft.XMLHTTP)
SourceCode=oSend.open(GET,url,false)
oSend.send()
ReadXml=BytesToBstr(oSend.responseBody,code)
start=Instr(ReadXml,start)
ReadXml=mid(ReadXml,start)
ends=Instr(ReadXml,ends)
ReadXml=left(ReadXml,ends-1)
endfunction
functionSubStr(body,start,ends)
start=Instr(body,start)
SubStr=mid(body,start+len(start)+1)
ends=Instr(SubStr,ends)
SubStr=left(SubStr,ends-1)
endfunction
dimgetcont,NewsContent
dimurl,title
url=新闻网址
getcont=ReadXml(url,gb2312,<tableclass=k2border=0,</table>)
getcont=RegexHtml(getcont)
dimKeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra
dimContactMan,Contact
fori=2toubound(getcont)
response.Write(getcont(i)&__<br>)
tempLink=mid(getcont(i),instr(getcont(i),href=)+6,instr(getcont(i),
onClick)-10)
tempLink=replace(tempLink,../,)
response.Write(i&:&tempLink&<br>)
NewsContent=ReadXml(tempLink,gb2312,<tdvalign=bottom
width=400>,<hrwidth=760
noshadesize=1color=#808080>
)
NewsContent=RemoveHtml(NewsContent)
NewsContent=replace(NewsContent,VbCrLf,)
NewsContent=replace(NewsContent,vbNewLine,)
NewsContent=replace(NewsContent,,)
NewsContent=replace(NewsContent,,)
NewsContent=replace(NewsContent,,)
NewsContent=replace(NewsContent,\n,)
NewsContent=replace(NewsContent,chr(10),)
NewsContent=replace(NewsContent,chr(13),)
===============getContent=======================
response.Write(NewsContent)
KeyId=SubStr(NewsContent,列号:,信息类别:)
NewsClass=SubStr(NewsContent,类别:,所在城市:)
City=SubStr(NewsContent,城市:,房屋具体位置:)
Position=SubStr(NewsContent,位置:,房屋类型:)
HouseType=SubStr(NewsContent,类型:,楼层:)
Level=SubStr(NewsContent,楼层:,使用面积:)
Area=SubStr(NewsContent,面积:,房价:)
Price=SubStr(NewsContent,房价:,其他说明:)
Demostra=SubStr(NewsContent,说明:,联系人:)
ContactMan=SubStr(NewsContent,联系人:,联系方式:)
Contact=SubStr(NewsContent,联系方式:,信息来源:)
response.Write(总序列号:&KeyId&<br>)
response.Write(信息类别:&NewsClass&<br>)
response.Write(所在城市:&City&<br>)
response.Write(房屋具体位置:&Position&<br>)
response.Write(房屋类型:&HouseType&<br>)
response.Write(楼层:&Level&<br>)
response.Write(使用面积:&Area&<br>)
response.Write(房价:&Price&<br>)
response.Write(其他说明:&Demostra&<br>)
response.Write(联系人:&ContactMan&<br>)
response.Write(联系方式:&Contact&<br>)
title=RemoveHTML(aa(i))
response.Write(title:&title)
if(application.Contents(n)=KeyId)then
ifexit=true
endif
next
ifnotifexitthen
application(time&i)=KeyId
添加到数据库
====================================================
setrs=server.CreateObject(adodb.recordset)
rs.openselecttop1*fromnewsorderbyiddesc,conn,3,3
rs.addnew
rs(NewsClass)=NewsClass
rs(City)=City
rs(Position)=Position
rs(HouseType)=HouseType
rs(Level)=Level
rs(Area)=Area
rs(Price)=Price
rs(Demostra)=Demostra
rs(ContactMan)=ContactMan
rs(Contact)=Contact
rs.update
rs.close
setrs=nothing
endif
==================================================
next
functionRemoveTag(body)
SetregEx=NewRegExp
regEx.Pattern=<[a].*?<\/[a]>
regEx.IgnoreCase=True
regEx.Global=True
SetMatches=regEx.Execute(body)
dimi,arr(15),ifexit
i=0
j=0
ForEachMatchinMatches
TempStr=Match.Value
TempStr=replace(TempStr,<td>,)
TempStr=replace(TempStr,</td>,)
TempStr=replace(TempStr,<tr>,)
TempStr=replace(TempStr,</tr>,)
arr(i)=TempStr
i=i+1
if(i>=15)then
exitfor
endif
Next
SetregEx=nothing
SetMatches=nothing
RemoveTag=arr
endfunction
functionRegexHtml(body)
dimr_arr(47),r_temp
SetregEx2=NewRegExp
regEx2.Pattern=<a.*?<\/a>
regEx2.IgnoreCase=True
regEx2.Global=True
SetMatches2=regEx2.Execute(body)
iii=0
ForEachMatchinMatches2
r_arr(iii)=Match.Value
iii=iii+1
Next
RegexHtml=r_arr
setregEx2=nothing
setMatches2=nothing
endfunction
======================================================
conn.close
setconn=nothing
%>
</body>
</html>