Behance 网站是设计师灵感必备网站,想要设计作品必先学会借鉴/抄袭/白嫖,可惜这个网站需要访问国外网站才能访问,对于国人不甚友好,甚至还出现了删号,渣渣狗得很!
这个网站跟国内的网站,花瓣网很像,甚至可以说花瓣学习了它不少,在瀑布流网页的展示上也有很多相似之处。
前面本渣渣就分享过花瓣网图片采集爬虫,感兴趣可以移步查看,现在还能用!
注:Behance 作品图片及内容采集 需配合访问国外网站的工具使用,前面本渣渣也有分享相关签到工具的源码脚本,尤其适合像本渣渣一样菜的白嫖党使用。
附上几个关键点,以供参考!
headers 协议头
网站反爬的基础方式之一,也是根本,那就是协议头,现在不少网站除了ua,还需要获取到cookies,Behance 网站也不例外,两个要素缺一不可,否则访问会返回不支持机器访问。
headers = {
    "cookie": cookie,
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
}
访问超时重试机制
由于使用的是访问国外网站的工具,而且是白嫖的免费工具,同时访问的是外网,因此在网页访问及图片下载请求的同时都会存在访问超时的情况,这个时候就需要对于访问失败或者超时错误进行重试处理。
- 设置访问超时10s,以免程序卡死
response = requests.get(url=url, headers=headers, timeout=10)
- 访问超时或者失败重试机制,这里重试两次,然后直接跳过
def get_response(url, headers):
    i = 0
    while i < 3:
        try:
            response = requests.get(url, headers=headers, timeout=8)
            return response
        except requests.exceptions.RequestException:
            i += 1
            print(f">> 获取网页出错,6S后将重试获取第:{i} 次")
            time.sleep(6)
多线程的使用
既然是图片的采集,那么效率方面一定要兼顾,这方面多线程的使用也是标配了,这里就搬出了本渣渣祖传的多线程代码以供参考,别问为什么这么写,直接照抄就是了,因为本渣渣也是抄的。
- 多线程下载图片版本一 threading(推荐)
#多线程下载图片方式一
import threading
def down_imgs(imgs, path, url):
    threadings = []
    for img in imgs:
        t = threading.Thread(target=tps, args=(path, img, url))
        threadings.append(t)
        t.start()
    for x in threadings:
        x.join()
    print(f"恭喜,多线程下载图片完成!")
- 多线程下载图片版本二 Pool
#多线程下载图片方式二
from multiprocessing.dummy import Pool as ThreadPool
def get_imgs(datas):
    try:
        # 开4个 worker,没有参数时默认是 cpu 的核心数
        pool = ThreadPool()
        results = pool.map(tpss, datas)
        pool.close()
        pool.join()
    except:
        print("Error: unable to start thread")
其他方面就没有什么写的,都是基操,有手就行(其实字数篇幅到了),就这样吧!
如对本渣渣狗屎一样的源码感兴趣,可拉至文末,屎山供参考!考虑到大部分老哥老妹对于源码不感兴趣,这里附上exe采集工具以供尝试,仅支持单篇作品文章采集,采集完成效果,包括图片及文字信息。
提供两个 Behance 作品采集工具,一个单线程下载图片版本,一个多线程下载图片版本V2.0!
采集过程演示:
采集效果:
附工具使用说明:
Behance 作品采集器-
工具用途:Behance 作品单篇采集,可采集图片及文字内容信息;
注意:需配合访问国外网站的工具使用。
工具保证免费无毒,首发唯一来源:
工具为Python编写,
编写环境为Win7 64位,推荐该环境下使用, 其他系统环境不保证兼容。
附上完整源码供参考,cookie需自行补充填写!
#Behance网站作品采集 | |
#https://www.behance.net/search/projects?field=industrial+design | |
# -*- coding: UTF-8 -*- | |
#@author:huguo00289 | |
import requests | |
import time | |
from lxml import etree | |
import re | |
import os | |
import random | |
import threading | |
from multiprocessing.dummy import Pool as ThreadPool | |
# Single-threaded image download entry point
def get_detail(url):
    """Fetch a Behance project page and parse it with the sequential downloader.

    Reads the module-level ``cookie`` global; hands the decoded HTML to
    ``get_html`` which saves text content and downloads images one by one.
    """
    request_headers = {
        "cookie": cookie,
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    }
    resp = requests.get(url=url, headers=request_headers, timeout=10)
    print(resp.status_code)
    page = resp.content.decode("utf-8")
    get_html(page, url)
# Threaded (one-thread-per-image) download entry point
def get_sdetail(url):
    """Fetch a Behance project page and parse it with the threaded downloader (v1).

    Reads the module-level ``cookie`` global; hands the decoded HTML to
    ``get_shtml`` which downloads images via ``down_imgs``.
    """
    request_headers = {
        "cookie": cookie,
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    }
    resp = requests.get(url=url, headers=request_headers, timeout=10)
    print(resp.status_code)
    page = resp.content.decode("utf-8")
    get_shtml(page, url)
# Thread-pool (version 2) download entry point
def get_ssdetail(url):
    """Fetch a Behance project page and parse it with the thread-pool downloader (v2).

    Reads the module-level ``cookie`` global; hands the decoded HTML to
    ``get_sshtml`` which downloads images via ``get_imgs``.
    """
    request_headers = {
        "cookie": cookie,
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    }
    resp = requests.get(url=url, headers=request_headers, timeout=10)
    print(resp.status_code)
    page = resp.content.decode("utf-8")
    get_sshtml(page, url)
def tp(path, img_url, url):
    """Download a single image (sequential version, no retry wrapper).

    The image is fetched with a random desktop User-Agent and the project
    page URL as referer, then written to ``path`` under its URL basename.
    Any request exception propagates to the caller.
    """
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    request_headers = {
        'Connection': 'close',
        'referer': url,
        "User-Agent": random.choice(ua_list),
    }
    img_name = img_url.split('/')[-1]
    resp = requests.get(img_url, headers=request_headers, timeout=10)
    time.sleep(1)  # brief pause between downloads to be gentle on the server
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(resp.content)
    print(f">> {img_name}下载图片成功")
# Retry wrapper: up to 4 attempts with a growing back-off
def get_response(url, headers):
    """GET *url*, retrying on any requests exception.

    Makes up to 4 attempts (the original comment claimed 3 — the loop bound
    is 4); the wait before attempt *i* grows as ``i * 2`` seconds.

    Returns:
        The ``requests.Response`` on success, or ``None`` when every attempt
        failed — callers MUST check for ``None`` before touching the result.
    """
    attempts = 0
    while attempts < 4:
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException:
            attempts += 1
            wait = attempts * 2
            # The original message hard-coded "6S" but the actual sleep is
            # attempts*2 seconds — report the real wait instead.
            print(f">> 获取网页出错,{wait}S后将重试获取第:{attempts} 次")
            time.sleep(wait)
    return None  # explicit: all retries exhausted
def tps(path, img_url, url):
    """Download a single image (threaded worker, version 1).

    Uses the retrying ``get_response`` helper.  Fix: ``get_response`` returns
    ``None`` when every retry fails, and the original then crashed on
    ``r.content`` — we now skip the image with a message instead.
    """
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    headers = {
        'Connection': 'close',
        'referer': url,
        "User-Agent": random.choice(ua_list),
    }
    img_name = img_url.split('/')[-1]
    r = get_response(img_url, headers)
    if r is None:
        # all retries exhausted — skip this image instead of raising AttributeError
        print(f">> {img_name}下载图片失败,已跳过")
        return
    time.sleep(1)  # brief pause between downloads
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">> {img_name}下载图片成功")
# Parse a project page and download its images sequentially
def get_html(html, url):
    """Parse a Behance project page: save title + body text, download images.

    Steps:
      1. Take the <title> as the work's name, sanitize it for use as a
         Windows-safe directory/file name, and create the directory.
      2. Save the main descriptive text to ``<title>.txt``.
      3. Download every image sequentially via ``tp``, retrying each image
         up to 3 times (the original used a triple-nested try/except pyramid;
         this is the same policy flattened into a loop).
    """
    tree = etree.HTML(html)
    h2 = tree.xpath('//title/text()')[0]
    h2 = h2.strip().replace('|', '')
    # replace characters that are illegal in Windows file names
    pattern = r"[\/\\\:\*\?\"\<\>\|]"
    h2 = re.sub(pattern, "_", h2)
    print(">> 获取网页标题成功,标题为:")
    print(h2)
    path = f'{h2}/'
    os.makedirs(path, exist_ok=True)
    main_text = tree.xpath('//div[@class="main-text"]//span/text()')
    text = '\n'.join(main_text)
    text = f'{h2}\n\n{text}'
    with open(f'{path}{h2}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(">> 恭喜,保存作品内容成功!")
    print(f"作品内容保存为:{h2}.txt !")
    imgs = tree.xpath('//div[@class="ImageElement-root-kir"]/img/@src')
    print(">> 获取图片成功,图片列表为:")
    print(imgs)
    for img in imgs:
        # up to 3 attempts per image; delay before retry is 4s then 6s,
        # None marks the final attempt (no further retry)
        for delay in (4, 6, None):
            try:
                tp(path, img, url)
                break
            except Exception as e:
                print(f"下载 {img} 图片失败,错误代码:{e}!")
                if delay is None:
                    print(f"下载 {img} 图片失败,已跳过该图片下载!")
                else:
                    print(f">> {delay}s后尝试重新下载图片...")
                    time.sleep(delay)
        time.sleep(5)  # pacing between images
# Threaded download, version 1: one thread per image
def down_imgs(imgs, path, url):
    """Download all images concurrently via ``tps``, one thread each.

    Starts every thread up front, then joins them all before returning.
    """
    workers = [threading.Thread(target=tps, args=(path, img, url)) for img in imgs]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print(f"恭喜,多线程下载图片完成!")
def tpss(data):
    """Pool worker (version 2): download one image.

    Args:
        data: a ``(path, img_url, referer_url)`` tuple — ``Pool.map`` only
              passes a single argument, hence the packed form.

    Fix: ``get_response`` returns ``None`` when every retry fails, and the
    original then crashed on ``r.content`` — we now skip the image instead.
    """
    path, img_url, url = data
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    headers = {
        'Connection': 'close',
        'referer': url,
        "User-Agent": random.choice(ua_list),
    }
    img_name = img_url.split('/')[-1]
    r = get_response(img_url, headers)
    if r is None:
        # all retries exhausted — skip this image instead of raising AttributeError
        print(f">> {img_name}下载图片失败,已跳过")
        return
    time.sleep(1)  # brief pause between downloads
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">> {img_name}下载图片成功")
# Threaded download, version 2: multiprocessing.dummy thread pool
def get_imgs(datas):
    """Download images via a thread pool mapped over ``tpss``.

    ``ThreadPool()`` with no argument defaults to the CPU core count.

    Fixes over the original: the bare ``except:`` swallowed every error
    (including KeyboardInterrupt) without detail, and a failure inside
    ``map`` leaked the pool.  The pool is now always closed/joined, and the
    actual exception is reported.
    """
    try:
        pool = ThreadPool()
        try:
            pool.map(tpss, datas)
        finally:
            # always release the worker threads, even if a download raises
            pool.close()
            pool.join()
    except Exception as e:
        print(f"Error: unable to start thread ({e})")
# Parse a project page and download its images with the pool downloader (v2)
def get_sshtml(html, url):
    """Parse a Behance project page and fetch its images via ``get_imgs``.

    Saves the sanitized <title> as the output folder name, writes the main
    text to ``<title>.txt``, then maps the image URLs over the thread pool.
    """
    tree = etree.HTML(html)
    title = tree.xpath('//title/text()')[0].strip().replace('|', '')
    # replace characters that are illegal in Windows file names
    title = re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", title)
    print(">> 获取网页标题成功,标题为:")
    print(title)
    folder = f'{title}/'
    os.makedirs(folder, exist_ok=True)
    spans = tree.xpath('//div[@class="main-text"]//span/text()')
    body = f'{title}\n\n' + '\n'.join(spans)
    with open(f'{folder}{title}.txt', 'w', encoding='utf-8') as f:
        f.write(body)
    print(">> 恭喜,保存作品内容成功!")
    print(f"作品内容保存为:{title}.txt !")
    imgs = tree.xpath('//div[@class="ImageElement-root-kir"]/img/@src')
    print(">> 获取图片成功,图片列表为:")
    print(imgs)
    # pack (folder, img_url, referer) tuples — Pool.map passes one argument
    datas = [(folder, img, url) for img in imgs]
    print(">> 正在多线程下载图片,请稍候...")
    get_imgs(datas)
    time.sleep(5)
# Parse a project page and download its images with the threaded downloader (v1)
def get_shtml(html, url):
    """Parse a Behance project page and fetch its images via ``down_imgs``.

    Saves the sanitized <title> as the output folder name, writes the main
    text to ``<title>.txt``, then launches one download thread per image.
    """
    tree = etree.HTML(html)
    title = tree.xpath('//title/text()')[0].strip().replace('|', '')
    # replace characters that are illegal in Windows file names
    title = re.sub(r"[\/\\\:\*\?\"\<\>\|]", "_", title)
    print(">> 获取网页标题成功,标题为:")
    print(title)
    folder = f'{title}/'
    os.makedirs(folder, exist_ok=True)
    spans = tree.xpath('//div[@class="main-text"]//span/text()')
    body = f'{title}\n\n' + '\n'.join(spans)
    with open(f'{folder}{title}.txt', 'w', encoding='utf-8') as f:
        f.write(body)
    print(">> 恭喜,保存作品内容成功!")
    print(f"作品内容保存为:{title}.txt !")
    imgs = tree.xpath('//div[@class="ImageElement-root-kir"]/img/@src')
    print(">> 获取图片成功,图片列表为:")
    print(imgs)
    print(">> 正在多线程下载图片,请稍候...")
    down_imgs(imgs, folder, url)
    time.sleep(5)
if __name__ == "__main__":
    # FIX: every fetch function reads a module-level `cookie` global, but it
    # was never defined anywhere in the file — running the script raised
    # NameError immediately.  The accompanying notes say the cookie must be
    # filled in by the user; define the placeholder here so the script at
    # least runs (the site will reject requests until it is filled in).
    cookie = ""  # TODO: paste your own Behance cookie string here
    print("欢迎使用 Behance 作品采集器 V2.0 -Python与SEO !")
    print("BUG 反馈-微信:huguo00289!")
    print("=========================================================")
    print("!!!注意,需要结合访问国外网站使用!!!"
          "\n"
          "需输入采集Behance作品链接"
          "\n"
          "粘贴/输入链接后回车运行采集器"
          "\n")
    print("=========================================================")
    url = input("请粘贴/输入要采集的 Behance 作品链接:")
    print(">> 正在采集,请稍候...")
    # get_detail(url)    # single-threaded image download
    get_sdetail(url)     # threaded image download, version 1
    # get_ssdetail(url)  # thread-pool image download, version 2
    print("恭喜,Behance 作品采集成功!")
    print("4s后自动关闭程序!")
    time.sleep(4)