前面分享过用 requests 采集 Behance 作品信息的爬虫,这篇带来另一个版本供参考,使用的是无头浏览器方案 selenium 采集。两者主要的不同在于:本篇使用 selenium 驱动浏览器获取页面源码,之后解析、获取信息的方式与前篇一致。
理论上,几乎所有的页面内容都可以采用无头浏览器来获取,不过考虑到采集页面的效率问题,并不太推荐这种方式;如果是自动化测试或者其他方面的应用,则可以尝试考虑使用。比如 so long a gigo 本渣渣就分享过淘宝抢购以及百度刷快排的源码,感兴趣的话可以移步浏览,仅供参考学习使用。
附上几个关键点,供参考。
selenium 配置
由于 selenium 需要驱动浏览器,webdriver 版本号一定要对应,一是对应浏览器,二是对应浏览器版本,这里本渣渣用的是谷歌 chromedriver.exe 。
配置参考:
# Selenium setup: the chromedriver binary must match the installed Chrome version.
chromedriver_path = r"D:\chromedriver_win32\chromedriver.exe"  # full path to chromedriver
options = webdriver.ChromeOptions()  # Chrome start-up options
# Optional: do not load images, which speeds up page loads.
# options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Important: drop the "enable-automation" switch so that sites are less likely
# to detect that the browser is driven by Selenium.
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# NOTE(review): `executable_path` is the Selenium 3 calling convention; newer
# Selenium 4 releases expect a `Service` object instead - confirm against the
# installed Selenium version.
browser = webdriver.Chrome(executable_path=chromedriver_path, options=options)
浏览器页面下拉
要用 selenium 驱动的浏览器获取完整的页面源码,必须先下拉页面,让后面的内容加载出来;这里本渣渣使用随机生成的数字来控制每次下拉的位置和停留时间,仅供参考。
# Scroll the page down to offset 5000 and wait for it to settle.
js = "var q=document.documentElement.scrollTop=5000"
browser.execute_script(js)
time.sleep(4)
# Pick 5 distinct integers from 0-9. Each value i is used twice: as the scroll
# offset (i * 500) and as the pause in seconds after that scroll.
lst = random.sample(range(10), 5)
print(lst)
for i in lst:
    js = f"var q=document.documentElement.scrollTop={i*500}"
    browser.execute_script(js)
    time.sleep(i)
# Alternative: jump to the bottom of the page.
# js = "var q=document.documentElement.scrollTop=100000"
# browser.execute_script(js)
# time.sleep(2)
# Alternative: jump back to the top of the page.
# js = "var q=document.documentElement.scrollTop=0"
# browser.execute_script(js)
# time.sleep(3)
附完整源码供参考:
# -*- coding: UTF-8 -*- | |
import requests,re,time | |
from lxml import etree | |
from selenium import webdriver | |
import random | |
import os | |
def tp(path, img_url, url):
    """Download one image and save it under ``path``.

    Args:
        path: Destination directory prefix. It is concatenated directly with
            the file name, so it must end with a path separator.
        img_url: URL of the image to download.
        url: URL of the page the image belongs to; sent as the ``referer``
            header.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status. Callers rely on an exception to decide
            whether to retry.
        OSError: If the target file cannot be written.
    """
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    headers = {
        'Connection': 'close',
        'referer': url,
        # Rotate the User-Agent on every request.
        "User-Agent": random.choice(ua_list),
    }
    # Drop any fragment / query string before taking the last path segment:
    # characters such as "?" are not valid in Windows file names.
    img_name = img_url.split('#', 1)[0].split('?', 1)[0].split('/')[-1]
    r = requests.get(img_url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of saving an error page as an image.
    r.raise_for_status()
    time.sleep(1)
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(r.content)
    print(f"{img_name}下载图片成功")
def get_detail(url):
    """Render ``url`` in Chrome via Selenium, save the page, then parse it.

    The page title (sanitised for use as a file name) becomes the name of the
    output directory and of the saved ``.html`` file. The rendered source is
    then passed to ``get_html`` for text and image extraction.

    Args:
        url: Address of the project page to open.
    """
    # chromedriver_path = r"C:\Users\Administrator\AppData\Local\Programs\Python\Python37\chromedriver.exe"
    chromedriver_path = r"D:\chromedriver_win32\chromedriver.exe"  # full path to chromedriver
    options = webdriver.ChromeOptions()  # Chrome start-up options
    # Optional: do not load images, which speeds up page loads.
    # options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Important: drop the "enable-automation" switch so that sites are less
    # likely to detect that the browser is driven by Selenium.
    options.add_experimental_option("excludeSwitches", ['enable-automation'])
    # NOTE(review): `executable_path` is the Selenium 3 calling convention;
    # newer Selenium 4 releases expect a `Service` object instead - confirm
    # against the installed Selenium version.
    browser = webdriver.Chrome(executable_path=chromedriver_path, options=options)
    try:
        browser.get(url)
        time.sleep(10)
        # Scroll the page down to offset 5000 and wait for it to settle.
        js = "var q=document.documentElement.scrollTop=5000"
        browser.execute_script(js)
        time.sleep(4)
        # Use the page title as the output name.
        h2 = browser.title
        print(h2)
        h2 = h2.strip()
        h2 = h2.replace('|', '')
        # Replace characters that are not allowed in Windows file names.
        pattern = r"[\/\\\:\*\?\"\<\>\|]"
        h2 = re.sub(pattern, "_", h2)
        if not h2:
            # An empty title would make the output path "/" (filesystem root).
            h2 = "untitled"
        path = f'{h2}/'
        os.makedirs(path, exist_ok=True)
        # Pick 5 distinct integers from 0-9. Each value i is used twice: as
        # the scroll offset (i * 500) and as the pause in seconds afterwards.
        lst = random.sample(range(10), 5)
        print(lst)
        for i in lst:
            js = f"var q=document.documentElement.scrollTop={i*500}"
            browser.execute_script(js)
            time.sleep(i)
        # Alternatives: jump to the bottom (scrollTop=100000) or back to the
        # top (scrollTop=0) with browser.execute_script, followed by a sleep.
        # Grab the rendered page source.
        html = browser.page_source
    finally:
        # Always shut the browser down, otherwise every call (and every
        # failure) leaves a Chrome/chromedriver process behind.
        browser.quit()
    with open(f'{path}{h2}.html', 'w', encoding='utf-8') as f:
        f.write(html)
    get_html(path, h2, html, url)
#解析页面 | |
def get_html(path, h2, html, url):
    """Extract the text and images from a saved page and store them in ``path``.

    Writes ``{path}{h2}.txt`` containing the title followed by the page's
    main text, then downloads every matched image through ``tp``.

    Args:
        path: Output directory prefix (expected to end with a separator).
        h2: Sanitised page title; used as the text file name and heading.
        html: Rendered page source to parse.
        url: Page address, forwarded to ``tp`` as the referer.
    """
    tree = etree.HTML(html)
    main_text = tree.xpath('//div[@class="main-text"]//span/text()')
    text = '\n'.join(main_text)
    text = f'{h2}\n\n{text}'
    with open(f'{path}{h2}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    # NOTE(review): these class names carry suffixes ("-kir", "-icR") that look
    # build-generated; the selector may stop matching if the site changes them.
    imgs = tree.xpath('//div[@class="ImageElement-root-kir ImageElement-loaded-icR"]/img/@src')
    print(imgs)
    for img in imgs:
        # Best-effort download: at most two attempts per image, waiting 4 s
        # after the first failure and 6 s after the second. A failed image is
        # reported and skipped so the remaining images are still fetched.
        for retry_delay in (4, 6):
            try:
                tp(path, img, url)
            except Exception as e:
                print(e)
                time.sleep(retry_delay)
            else:
                break
        # Pause between images to keep the request rate low.
        time.sleep(5)
if __name__ == "__main__":
    # Alternative sample project:
    # url = "https://www.behance.net/gallery/152037689/CLIO-DEWY-BLUR-TINT?tracking_source=search_projects"
    url = "https://www.behance.net/gallery/174203985/Squashy?tracking_source=search_projects_recommended"
    get_detail(url)