Writing a crawler for a simple website is like working an assembly line: copy a bit, tweak a bit, and if nothing breaks, ship it. Quick and painless, so here is another light, low-effort post. This is a fairly simple crawler, well suited for practice and learning: it scrapes a site's works pages and collects each work's title, body text, and images, with the images downloaded across multiple threads.
Since the target is an overseas site, each request is retried up to three times on timeout, and errors while scraping a detail page are caught so that page can simply be skipped. It is a good exercise for Python crawler beginners; if you are looking for a site to practice on, give scraping and downloading its data a try.
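Stripped to its core, the retry-and-skip idea looks roughly like the sketch below. This is only a simplified illustration of what the full source at the end of the post does; fetch_with_retry and detail_links are placeholder names, not part of the actual script.

import time
import requests

def fetch_with_retry(url, headers, retries=3, timeout=10):
    # Try the request up to `retries` times, waiting a little longer after each failure
    for attempt in range(1, retries + 1):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            print(f">> Request failed, retry {attempt}/{retries}")
            time.sleep(attempt * 2)
    return None  # every attempt failed; the caller decides what to do

detail_links = []  # would be filled from the works listing page
for href in detail_links:
    resp = fetch_with_retry(href, {"User-Agent": "Mozilla/5.0"})
    if resp is None:
        continue  # skip this detail page instead of aborting the whole crawl
    # ... parse the detail page here ...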
The code that handles the key nodes on the detail page:
tree = etree.HTML(html)
h1 = tree.xpath('//h1[@class="entry-title"]/text()')[0]
pattern = r"[\/\\\:\*\?\"\<\>\|]"
h1 = re.sub(pattern, "_", h1)  # replace characters that are illegal in file names with underscores
print(h1)
path = f'{h1}/'
os.makedirs(path, exist_ok=True)
print(f">> Created the save directory {h1}!")
ptexts = tree.xpath('//div[@class="main-text"]/p/text()')
ptext = ''.join(ptexts)
print(ptext)
with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
    f.write(f'{h1}\n{ptext}')
print(f">> Saved {h1}.txt!")
imgs = tree.xpath('//div[@class="slider-for"]/div[@class="sp-slide"]/img/@src')
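The image URLs collected above are then handed to a thread-per-image downloader. The sketch below is a simplified, self-contained version of that idea (the actual down_imgs/get_img functions are in the full source at the end; the function names and the bare requests.get here are illustrative only):

import os
import threading
import requests

def download_one(path, img_url):
    # Each thread downloads one image, named after the last segment of its URL
    img_name = img_url.split('/')[-1]
    r = requests.get(img_url, timeout=10)
    with open(os.path.join(path, img_name), 'wb') as f:
        f.write(r.content)

def download_all(path, img_urls):
    os.makedirs(path, exist_ok=True)
    threads = [threading.Thread(target=download_one, args=(path, u)) for u in img_urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait until every image of this work is saved before moving on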
At the end of the article I have attached an earlier version I wrote; compare the two and see where they differ.
The complete source code is attached below, for reference and learning only.
# -*- coding: UTF-8 -*-
# @WeChat Official Account: eryeji
# https://www.nendo.jp/jp/works/
import requests
from lxml import etree
import time
import random
import re
import threading
import os
def get_ua():
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    # Pick a random User-Agent so the requests look less uniform
    ua = random.choice(ua_list)
    return ua
def get_hrefs():
    url = "https://www.nendo.jp/jp/works/"
    headers = {
        "User-Agent": get_ua()
    }
    response = requests.get(url=url, headers=headers, timeout=6)
    print(response.status_code)
    html = response.content.decode('utf-8')
    # print(html)
    tree = etree.HTML(html)
    hrefs = tree.xpath('//div[@class="entry-content"]/a/@href')
    print(len(hrefs))
    print(hrefs)
    for href in hrefs:
        try:
            get_detail(href)
        except Exception as e:
            print(f">> Failed to scrape {href}, skipping: {e}")
        time.sleep(3)
def get_detail(href):
    headers = {
        "User-Agent": get_ua()
    }
    response = requests.get(url=href, headers=headers, timeout=6)
    print(response.status_code)
    html = response.content.decode('utf-8')
    # print(html)
    tree = etree.HTML(html)
    h1 = tree.xpath('//h1[@class="entry-title"]/text()')[0]
    pattern = r"[\/\\\:\*\?\"\<\>\|]"
    h1 = re.sub(pattern, "_", h1)  # replace characters that are illegal in file names with underscores
    print(h1)
    path = f'{h1}/'
    os.makedirs(path, exist_ok=True)
    print(f">> Created the save directory {h1}!")
    ptexts = tree.xpath('//div[@class="main-text"]/p/text()')
    ptext = ''.join(ptexts)
    print(ptext)
    with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
        f.write(f'{h1}\n{ptext}')
    print(f">> Saved {h1}.txt!")
    imgs = tree.xpath('//div[@class="slider-for"]/div[@class="sp-slide"]/img/@src')
    print(len(imgs))
    print(imgs)
    down_imgs(path, imgs)
# Request with up to 3 retries on failure
def get_resp(url):
    i = 0
    while i < 4:
        try:
            headers = {
                "User-Agent": get_ua()
            }
            response = requests.get(url, headers=headers, timeout=10)
            print(response.status_code)
            return response
        except requests.exceptions.RequestException:
            i += 1
            print(f">> Request failed, retry {i} after {i * 2}s")
            time.sleep(i * 2)
def down_imgs(path, imgs):
    # Download all images of one work, one thread per image
    threadings = []
    for img in imgs:
        t = threading.Thread(target=get_img, args=(path, img))
        threadings.append(t)
        t.start()
    for x in threadings:
        x.join()
    print("Multi-threaded image download finished!")
# Download a single image
def get_img(path, img_url):
    img_name = img_url.split('/')[-1]
    r = get_resp(img_url)
    if r is None:
        print(f">> Giving up on {img_name}, all retries failed")
        return
    time.sleep(1)
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">> Downloaded {img_name}")
def main():
    get_hrefs()

if __name__ == '__main__':
    main()