目前市面上的网页采集器,要么收费,要么有病毒,要么就根本无法抓取复杂的网站,现在我把我公司的采集器源码分享给大家!
编写语言:Microsoft Visual Basic 6.0 中文版
允许环境:win7
采集原理:模拟人工,调用webbrowser 的api进行采集
好处:可以采集任何网站!
基本操作:
1、开启调试
Private Sub Command1_Click()
On Error Resume Next
修改成
Private Sub Command1_Click()
'On Error Resume Next
2、图片保存路径
主程序下的pic文件夹
3、简繁体转换
StoT 简转繁
TtoS 繁转简
4、入口url文件url.txt
5、需要修改的文件:add.html
6、采集原理
WB加载页面,开始采集 wb2 提交到服务器上面
7、数据处理
Function StringWithoutBrackets(ByVal s As String) As String
Dim mc As MatchCollection, ma As Match, sV, sA, S_TR As String, nS_TR As String
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<style[^>]*?>[\s\S]*?<\/style>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(s, "")
End With
If adel.value = 1 Then
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<a[^>]*?>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<\/a>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, ">")
End With
End If
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<\/p>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<p>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<div[^>]*?none[^>]*?>[\s\S]*?<\/div>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<!--[\s\S]*?-->"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<script[^>]*?>[\s\S]*?<\/script>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "[\n|\r]+"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<p>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<pre>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<br>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<\/li>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<strong>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[#]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<\/strong>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[$]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<h2>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[#]")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<\/h2>"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[$]")
End With
If Check2.value = 1 Then '敏感字检测
StringWithoutBrackets = jc_Key(StringWithoutBrackets)
End If
If upimg.value = 0 Then '删除所有图片?
Set mc = RegExpTest("<[^>]+src=""([^""]+)""", StringWithoutBrackets) '通过正则表示式提取图片,务必注意大小写。
If mc.Count > 0 Then
For Each ma In mc
DoEvents '程序优化
'MsgBox (ma.Value)
S_TR = ma.SubMatches(0)
If InStr(1, S_TR, "end_news.png") Or InStr(1, S_TR, "/r?") Then
Else
'StringWithoutBrackets = Replace(StringWithoutBrackets, S_TR, ">[img]" & S_TR & "[/img]<")
End If
Next
End If
Else
Set mc = RegExpTest("<[^>]+src=""([^""]+)""", StringWithoutBrackets) '通过正则表示式提取图片,务必注意大小写。
If mc.Count > 0 Then
For Each ma In mc
DoEvents '程序优化
'MsgBox (ma.Value)
S_TR = ma.SubMatches(0)
If (Left(S_TR, 1) = "/") Then
nS_TR = "http://" + WB.document.domain + S_TR
Else
If (Left(S_TR, 2) <> "ht") Then
nS_TR = Mid(WB.LocationURL, 1, InStrRev(WB.LocationURL, "/")) & S_TR
Else
nS_TR = S_TR
End If
End If
des = GetType(S_TR)
FileName = App.Path & "\ypic\" & Int(Rnd * 530000) & upnum & "." & des
If URLDownloadToFile(0, nS_TR, FileName, 0, 0) <> 0 Then
'Call DownNetFile(downurl, FileName, "")
End If
If Dir(FileName) = "" Then
Else
fileSize = FileLen(FileName)
If fileSize < 100 Then '已经存在了抛弃大文件
Text1.Text = "文件少于2K抛弃:" & FileName
Kill FileName
GoTo sup
End If
dirtxt = LCase(DigestFileToHexStr(FileName))
If Dir(App.Path & "\pic\" & Left(dirtxt, 3), vbDirectory) = "" Then
MkDir App.Path & "\pic\" & Left(dirtxt, 3)
End If
If Dir(App.Path & "\pic\" & dirtxt & "." & des, vbDirectory) = "" Then
Name FileName As App.Path & "\pic\" & dirtxt & "." & des
Else
Text1.Text = "重复:"
Kill FileName
upnum = upnum + 1
Form1.Caption = "重复" & upnum
GoTo sup
End If
titles = wztitle
upimage = "https://i.ysv8.com/upload/" & Replace(dirtxt, "\", "/") & "." & des
upnum = upnum + 1
Form1.Caption = upnum
sup:
StringWithoutBrackets = Replace(StringWithoutBrackets, S_TR, ">[img]" & "https://i.ysv8.com/upload/" & Replace(dirtxt, "\", "/") & "." & des & "[/img]<")
End If
Next
End If
End If
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "[<][^>]*[>]"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "")
End With
StringWithoutBrackets = Replace(StringWithoutBrackets, "[@]", "<br>")
StringWithoutBrackets = Replace(StringWithoutBrackets, "[#]", "<strong>")
StringWithoutBrackets = Replace(StringWithoutBrackets, "[$]", "</strong>")
StringWithoutBrackets = Replace(StringWithoutBrackets, "指导意见", vbLf + "指导意见")
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "<br>[ ]?"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "(<br>)+"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "(<br>)+"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>")
End With
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "(http\:\/\/[a-zA-z0-9.\/\-]*)+"
.IgnoreCase = True
StringWithoutBrackets = .Replace(StringWithoutBrackets, vbLf)
End With
StringWithoutBrackets = jc_th(StringWithoutBrackets)
'StringWithoutBrackets = jc_jq(StringWithoutBrackets)
groupid = 1
'MsgBox (StringWithoutBrackets)
End Function
注意:您最好是具备一定的Visual Basic编程基础,如果您不具备可以去论坛联系我,帮您配置!
(我不一定有时候帮您,您最好是自学下)
源码下载及论坛:
https://www.ysv8.com/f/129.html
源代码免费下载!请点击【采集器免费下载】那几个红字