自己做采集程序

2018-09-06 10:29

阅读:783

  现在网上的采集程序很多,但是有时候你发现一个好的网站,想自己做个采集工具采集一些信息,就需要自己去写程序了,其实这样的采集程序并不难写,主要是去分析源网站的网页结构。
首先去下载个XMLHTTP的类文件:
<%
Class xhttp
private cset,sUrl,sError
Private Sub Class_Initialize()
cset=UTF-8
cset=GB2312
sError=
end sub

  Private Sub Class_Terminate()
End Sub

  Public Property LET URL(theurl)
sUrl=theurl
end property
public property GET BasePath()
BasePath=mid(sUrl,1,InStrRev(sUrl,/)-1)
end property
public property GET FileName()
FileName=mid(sUrl,InStrRev(sUrl,/)+1)
end property
public property GET Html()
Html=BytesToBstr(getBody(sUrl))
end property

  public property GET xhttpError()
xhttpError=sError
end property

  private Function BytesToBstr(body)
on error resume next
Cset:GB2312 UTF-8
dim objstream
set objstream = Server.CreateObject(adodb.stream)
with objstream
.Type = 1
.Mode = 3
.Open
.Write body
.Position = 0
.Type = 2
.Charset = Cset
BytesToBstr = .ReadText
.Close
end with
set objstream = nothing
End Function

  private function getBody(surl)
on error resume next
dim xmlHttp
Set xmlHttp=server.createobject(Msxml2.XMLHTTP.4.0)
set xmlHttp=server.createobject(Microsoft.XMLHTTP)
set xmlHttp=server.createobject(MSXML2.ServerXMLHTTP)
xmlHttp.setTimeouts 10000,10000,10000,30000
xmlHttp.open GET,surl,false
xmlHttp.send
if xmlHttp.readystate=4 then
if xmlHttp.status=200 then
getBody=xmlhttp.responsebody
end if
else
getBody=
end if

  if Err.Number<>0 then
sError=Err.Number
Err.clear
else
sError=
end if
set xmlHttp=nothing
end function

  Public function saveimage(tofile,isoverwrite)
on error resume next
dim objStream,objFSO,imgs

  if Not isoverwrite Then
Set objFSO = Server.CreateObject(Scripting.FileSystemObject)
If objFSO.FileExists(Server.MapPath(tofile)) Then
Exit Function
End If
Set objFSO = Nothing
End IF

  imgs=getBody(sUrl)
Set objStream = Server.CreateObject(ADODB.Stream)
with objStream
.Type =1
.Open
.write imgs
.SaveToFile server.mappath(tofile),2
.Close()
end with
set objstream=nothing
end function

  end class

  %>
用了这个类文件,做起事情来就方便多了。
然后就可以分析采集网站的网页结构,写采集程序了。
下面给个例子:
<!--#include file=conn.asp-->
<!--#include file=inc/xhttp_class.asp-->
<!--#include file=inc/function.asp-->
<%
server.ScriptTimeout = 1000
%>
<html>
<head>
<meta http-equiv=Content-Type content=text/html; charset=gb2312 />
<title>BT采集器</title>
</head>
<body>
<form name=form1 method=post action=get81bt.asp>
分类ID:
<input type=text name=cid value=<%=request(cid)%>><br>
开始ID:
<input type=text name=startid value=<%=request(startid)%>>
<br>
结束ID:
<input type=text name=overid value=<%=request(overid)%>>
<br>
分类名称:<input type=text name=classname value=<%=request(classname)%>>为空自动获取
<br>
<input name=action type=hidden id=action value=getdata>
<input type=submit name=Submit value=采集>
</form>
当前ID:<%=request(id)%> <br>
<%
dim action

action = Request(action)
if action = getdata then
cid = Request(cid)
startid = Request(startid)
overid = Request(overid)
id = Request(id)
if id = then id = startid

set objxhttp = new xhttp

objxhttp.URL =
content = objxhttp.Html

if InStr(content,网站维护中) then
call NextID
response.End()
end if

list = GetContent(content,<!--内容开始-->,<!--内容结束-->,0)

Dim regEx, Match, Matches,patrn
Set regEx = New RegExp
patrn = <a href=../BtHtml/(.+?)>
regEx.Pattern = patrn
regEx.IgnoreCase = True
regEx.Global = True
Set Matches = regEx.Execute(list)
on error resume next
For Each Match in Matches

response.write Match.Value & <br>
weburl =
response.write weburl & <br>
response.Flush()

objxhttp.URL = weburl
cpage = objxhttp.Html
cpage = GetContent(cpage,<!--内容开始-->,<!--内容结束-->,0)

title = GetContent(cpage,BT资源名称:<strong>,</strong>,0)
title = stripHTML(title)

IF Request(classname) <> then
classname = Request(classname)
Else
if InStr(title,喜剧) then
classname = 喜剧
Elseif InStr(title,动作) then
classname = 动作
Elseif InStr(title,惊悚) then
classname = 惊悚
Elseif InStr(title,犯罪) then
classname = 犯罪
Elseif InStr(title,恐怖) then
classname = 恐怖
Elseif InStr(title,爱情) then
classname = 爱情
Elseif InStr(title,冒险) then
classname = 冒险
Elseif InStr(title,科幻) then
classname = 科幻
Elseif InStr(title,悬念) then
classname = 悬念
Elseif InStr(title,奇幻) then
classname = 奇幻
Elseif InStr(title,战争) then
classname = 战争
Elseif InStr(title,连续剧) then
classname = 连续剧
Elseif InStr(title,综艺) then
classname = 综艺
Elseif InStr(title,灾难) then
classname = 灾难
Elseif InStr(title,伦理) then
classname = 伦理
Elseif InStr(title,动漫) or InStr(title,动画) then
classname = 动漫
Elseif InStr(title,国语) or InStr(title,集) then
classname = 其他影视
Else
classname = 其他
End if
End IF

intro = GetContent(cpage,<tr><td width=770 bgcolor=#FFFFFF><div style=margin:10px;line-height:150%>,</div>,0)
intro = Replace(intro,<br />,[br])
intro = Replace(intro,<BR />,[br])
intro = Replace(intro,<BR>,[br])
intro = Replace(intro,<br>,[br])
intro = Replace(intro,<p>,[p])
intro = Replace(intro,<P>,[p])
intro = Replace(intro,</p>,[/p])
intro = Replace(intro,</P>,[p])
intro = Replace(intro,<img,[img)
intro = Replace(intro,<IMG,[img)
intro = stripHTML(intro)
intro = Replace(intro,[br],<br>)
intro = Replace(intro,[p],<p>)
intro = Replace(intro,[/p],</p>)
intro = Replace(intro,[img,<img)
intro = Replace(intro,[img],<img src=)
intro = Replace(intro,[/img],>)
intro = Replace(intro,[IMG],<img src=)
intro = Replace(intro,[/IMG],>)
response.write t
response.End()

addtime = Trim(GetContent(cpage,发布时间:,,0))
if Not IsDate(addtime) then addtime = now()

username = bt

filesize = GetContent(content,BT文件大小:,,0)

title2 = title

downurl = GetContent(cpage,<a style=color:red href=,,0)

p = CDate(addtime)
Dim sRnd
Randomize
sRnd = Int(900 * Rnd) + 100
sFileName = year(p) & month(p) & day(p) & hour(now) & minute(now) & second(now) & sRnd & .torrent

url = torrent/ & year(p) & - & month(p) & - & day(p) & / & sFileName
Call CreateF(url)

Text
Response.Write classname & <br>
Response.write title & <br>
response.Write intro & <br>
response.Write addtime & <br>
response.Write username & <br>
response.Write filesize & <br>
response.Write downurl & <br>
response.Write url & <br>
response.Flush()

response.End()
database

if err.number = 0 then
if (Not IsNull(title)) and title <> and downurl <> then
set rs = server.CreateObject(adodb.recordset)
sql = select * from bt_class where classname = & classname &
rs.open sql,conn,1,3
if rs.eof then
rs.addnew
rs(classname) = classname
rs.update
end if
classid = rs(classid)
rs.close
set rs = nothing

set rs = server.CreateObject(adodb.recordset)
sql = select * from bt_movie where title in ( & title & )
rs.open sql,conn,1,3
if rs.eof then
response.Write <div><font color=blue>写入数据库...</font></div>
response.Flush()
rs.addnew
rs(classid) = classid
rs(title) = title
rs(title2) = title2
rs(intro) = intro
rs(username) = username
rs(filesize) = filesize
rs(url) = url
rs(serverid) = 1
rs(addtime) = addtime
rs(ismake) = 0
rs.update

objxhttp.URL = downurl
objxhttp.saveimage url,False
else
response.Write <div><font color=red>已经存在!</font></div>
end if
rs.close
set rs = nothing

objxhttp.URL = downurl
objxhttp.saveimage url,False
End IF

Else
err.clear
End IF
response.Write -------------------------------------------<br>
Next
set regEx = nothing


response.Write 下一页<br>
response.Flush()

Call NextID()

end if

Sub NextID
conn.close
set conn = nothing

if cint(startid) < cint(overid) and cint(id) < cint(overid) then
response.Write <script>location.href=get81bt.asp?action=getdata&classname= & Request(classname) & &cid= & cid & &startid= & startid & &overid= & overid & &id=& id + 1 &</script>
Elseif cint(startid) > cint(overid) and cint(id) > cint(overid) then
response.Write <script>location.href=get81bt.asp?action=getdata&classname= & Request(classname) & &cid= & cid & &startid= & startid & &overid= & overid & &id=& id - 1 &</script>
Else
Response.Write 采集完成!<br>
response.End()
End if
End Sub

%>

</body>
</html>


评论


亲,登录后才可以留言!