目前市面上的网页采集器,要么收费,要么有病毒,要么就根本无法抓取复杂的网站,现在我把我公司的采集器源码分享给大家!
编写语言:Microsoft Visual Basic 6.0 中文版
允许环境:win7
采集原理:模拟人工,调用webbrowser 的api进行采集
好处:可以采集任何网站!
基本操作:
1、开启调试
Private Sub Command1_Click()
On Error Resume Next
修改成
Private Sub Command1_Click()
'On Error Resume Next
2、图片保存路径
主程序下的pic文件夹
3、简繁体转换
StoT 简转繁
TtoS 繁转简
4、入口url文件url.txt
5、需要修改的文件:add.html
6、采集原理
WB加载页面,开始采集 wb2 提交到服务器上面
7、数据处理
Function StringWithoutBrackets(ByVal s As String) As String | |
Dim mc As MatchCollection, ma As Match, sV, sA, S_TR As String, nS_TR As String | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<style[^>]*?>[\s\S]*?<\/style>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(s, "") | |
End With | |
If adel.value = 1 Then | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<a[^>]*?>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<\/a>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, ">") | |
End With | |
End If | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<\/p>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<p>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<div[^>]*?none[^>]*?>[\s\S]*?<\/div>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<!--[\s\S]*?-->" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<script[^>]*?>[\s\S]*?<\/script>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "[\n|\r]+" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<p>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<pre>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<br>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<\/li>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<strong>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[#]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<\/strong>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[$]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<h2>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[#]") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<\/h2>" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "[$]") | |
End With | |
If Check2.value = 1 Then '敏感字检测 | |
StringWithoutBrackets = jc_Key(StringWithoutBrackets) | |
End If | |
If upimg.value = 0 Then '删除所有图片? | |
Set mc = RegExpTest("<[^>]+src=""([^""]+)""", StringWithoutBrackets) '通过正则表示式提取图片,务必注意大小写。 | |
If mc.Count > 0 Then | |
For Each ma In mc | |
DoEvents '程序优化 | |
'MsgBox (ma.Value) | |
S_TR = ma.SubMatches(0) | |
If InStr(1, S_TR, "end_news.png") Or InStr(1, S_TR, "/r?") Then | |
Else | |
'StringWithoutBrackets = Replace(StringWithoutBrackets, S_TR, ">[img]" & S_TR & "[/img]<") | |
End If | |
Next | |
End If | |
Else | |
Set mc = RegExpTest("<[^>]+src=""([^""]+)""", StringWithoutBrackets) '通过正则表示式提取图片,务必注意大小写。 | |
If mc.Count > 0 Then | |
For Each ma In mc | |
DoEvents '程序优化 | |
'MsgBox (ma.Value) | |
S_TR = ma.SubMatches(0) | |
If (Left(S_TR, 1) = "/") Then | |
nS_TR = "http://" + WB.document.domain + S_TR | |
Else | |
If (Left(S_TR, 2) <> "ht") Then | |
nS_TR = Mid(WB.LocationURL, 1, InStrRev(WB.LocationURL, "/")) & S_TR | |
Else | |
nS_TR = S_TR | |
End If | |
End If | |
des = GetType(S_TR) | |
FileName = App.Path & "\ypic\" & Int(Rnd * 530000) & upnum & "." & des | |
If URLDownloadToFile(0, nS_TR, FileName, 0, 0) <> 0 Then | |
'Call DownNetFile(downurl, FileName, "") | |
End If | |
If Dir(FileName) = "" Then | |
Else | |
fileSize = FileLen(FileName) | |
If fileSize < 100 Then '已经存在了抛弃大文件 | |
Text1.Text = "文件少于2K抛弃:" & FileName | |
Kill FileName | |
GoTo sup | |
End If | |
dirtxt = LCase(DigestFileToHexStr(FileName)) | |
If Dir(App.Path & "\pic\" & Left(dirtxt, 3), vbDirectory) = "" Then | |
MkDir App.Path & "\pic\" & Left(dirtxt, 3) | |
End If | |
If Dir(App.Path & "\pic\" & dirtxt & "." & des, vbDirectory) = "" Then | |
Name FileName As App.Path & "\pic\" & dirtxt & "." & des | |
Else | |
Text1.Text = "重复:" | |
Kill FileName | |
upnum = upnum + 1 | |
Form1.Caption = "重复" & upnum | |
GoTo sup | |
End If | |
titles = wztitle | |
upimage = "https://i.ysv8.com/upload/" & Replace(dirtxt, "\", "/") & "." & des | |
upnum = upnum + 1 | |
Form1.Caption = upnum | |
sup: | |
StringWithoutBrackets = Replace(StringWithoutBrackets, S_TR, ">[img]" & "https://i.ysv8.com/upload/" & Replace(dirtxt, "\", "/") & "." & des & "[/img]<") | |
End If | |
Next | |
End If | |
End If | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "[<][^>]*[>]" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "") | |
End With | |
StringWithoutBrackets = Replace(StringWithoutBrackets, "[@]", "<br>") | |
StringWithoutBrackets = Replace(StringWithoutBrackets, "[#]", "<strong>") | |
StringWithoutBrackets = Replace(StringWithoutBrackets, "[$]", "</strong>") | |
StringWithoutBrackets = Replace(StringWithoutBrackets, "指导意见", vbLf + "指导意见") | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "<br>[ ]?" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "(<br>)+" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "(<br>)+" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>") | |
End With | |
With CreateObject("VBSCRIPT.REGEXP") | |
.Global = True | |
.Pattern = "(http\:\/\/[a-zA-z0-9.\/\-]*)+" | |
.IgnoreCase = True | |
StringWithoutBrackets = .Replace(StringWithoutBrackets, vbLf) | |
End With | |
StringWithoutBrackets = jc_th(StringWithoutBrackets) | |
'StringWithoutBrackets = jc_jq(StringWithoutBrackets) | |
groupid = 1 | |
'MsgBox (StringWithoutBrackets) | |
End Function |
注意:您最好是具备一定的Visual Basic编程基础,如果您不具备可以去论坛联系我,帮您配置!
(我不一定有时候帮您,您最好是自学下)
源码下载及论坛:
https://www.ysv8.com/f/129.html
源代码免费下载!请点击【采集器免费下载】那几个红字