Python 爬虫，Nendo 网站作品信息采集爬虫源码！

简单的网站写爬虫就跟流水线加工一样，抄抄改改，没有问题就直接上了，直截了当省事，又是一篇没有营养的水文。一个比较简单的爬虫，适合练手学习使用，主要是爬取和采集网站的作品信息，包括标题、内容及图片，其中图片采用了多线程爬取。

考虑到外网爬取，所以采用了三次访问超时重试的机制，同时对于详情页的爬取采用了报错机制跳过处理，适合新人学习爬取使用。小日子的网站随便爬，加大力度，使劲搞，适合 Python 爬虫新人练手使用和学习，如果你正在找练手网站，不妨尝试爬取下载数据。

详情页关键节点处理的代码：

 
# Excerpt from the body of get_detail(); `html` is the decoded page source.
tree = etree.HTML(html)
# Page title, used for both the folder name and the text file name.
h1 = tree.xpath('//h1[@class="entry-title"]/text()')[0]
pattern = r"[\/\\\:\*\?\"\<\>\|]"
h1 = re.sub(pattern, "_", h1)  # replace the matched characters with an underscore
print(h1)
path = f'{h1}/'
os.makedirs(path, exist_ok=True)
print(f">> 生成保存目录 {h1} 文件夹成功！")
# Concatenate the text nodes of every <p> directly inside div.main-text.
ptexts = tree.xpath('//div[@class="main-text"]/p/text()')
ptext = ''.join(ptexts)
print(ptext)
with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
    f.write(f'{h1}\n{ptext}')
print(f">> 保存 {h1}.txt 文件成功！")
# Image URLs taken from the slider markup.
imgs = tree.xpath('//div[@class="slider-for"]/div[@class="sp-slide"]/img/@src')

文章最后附上早期写的，看看有没有差距和不同之处呢？

附上完整源码仅供参考学习使用。

 
# -*- coding: UTF-8 -*-
# @公众号：eryeji
# https://www.nendo.jp/jp/works/
 
import requests
from lxml import etree
import time
import random
import re
import threading
import os
 
 
def get_ua():
    """Return one User-Agent string picked at random from a fixed list.

    Returns:
        str: a desktop browser User-Agent value.
    """
    # Each entry must be a bare User-Agent value, with no descriptive label
    # (such as a browser name) appended after it.
    ua_list = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    )
    return random.choice(ua_list)
 
 
def get_hrefs():
    """Fetch the works index page and process every detail link found on it.

    The ``href`` of each ``a`` directly inside a ``div`` whose class is
    ``entry-content`` is passed to ``get_detail``, followed by a 3 second
    pause.  A detail page that raises is reported and skipped so the
    remaining links are still processed.
    """
    url = "https://www.nendo.jp/jp/works/"
    headers = {"User-Agent": get_ua()}
    response = requests.get(url=url, headers=headers, timeout=6)
    print(response.status_code)
    html = response.content.decode('utf-8')
    tree = etree.HTML(html)
    hrefs = tree.xpath('//div[@class="entry-content"]/a/@href')
    print(len(hrefs))
    print(hrefs)
    for href in hrefs:
        try:
            get_detail(href)
        except Exception as err:
            # Per-page boundary: one broken page must not abort the whole run.
            print(f">> 采集 {href} 出错，已跳过：{err!r}")
        time.sleep(3)
 
 
 
def get_detail(href):
    """Download one detail page and save its title, text and images.

    A folder named after the sanitised page title is created relative to the
    current working directory.  ``<title>.txt`` is written into it (title on
    the first line, then the concatenated paragraph text), and the slider
    image URLs are passed to ``down_imgs``.

    Args:
        href: URL of the detail page.

    Raises:
        ValueError: if the page has no non-blank ``h1.entry-title`` text.
    """
    headers = {"User-Agent": get_ua()}
    response = requests.get(url=href, headers=headers, timeout=6)
    print(response.status_code)
    html = response.content.decode('utf-8')
    tree = etree.HTML(html)
    titles = tree.xpath('//h1[@class="entry-title"]/text()')
    # Surrounding whitespace would end up in the folder and file names, and an
    # empty title would make the target path "/", so both are rejected here.
    h1 = titles[0].strip() if titles else ''
    if not h1:
        raise ValueError(f"no entry-title text found on {href}")
    pattern = r"[\/\\\:\*\?\"\<\>\|]"
    h1 = re.sub(pattern, "_", h1)  # replace the matched characters with an underscore
    print(h1)
    path = f'{h1}/'
    os.makedirs(path, exist_ok=True)
    print(f">> 生成保存目录 {h1} 文件夹成功！")
    ptexts = tree.xpath('//div[@class="main-text"]/p/text()')
    ptext = ''.join(ptexts)
    print(ptext)
    with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
        f.write(f'{h1}\n{ptext}')
    print(f">> 保存 {h1}.txt 文件成功！")
    imgs = tree.xpath('//div[@class="slider-for"]/div[@class="sp-slide"]/img/@src')
    print(len(imgs))
    print(imgs)
    down_imgs(path, imgs)
 
 
 
 
# One initial attempt plus up to 3 retries
def get_resp(url):
    """GET *url* with a random User-Agent, retrying on request errors.

    The request is attempted at most 4 times.  After a failed attempt the
    function waits 2, 4 and then 6 seconds before the next one; nothing is
    announced or waited for after the last attempt.

    Args:
        url: address to fetch.

    Returns:
        The response of the first attempt that does not raise, or ``None``
        when every attempt raised ``requests.exceptions.RequestException``.
    """
    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        try:
            headers = {"User-Agent": get_ua()}
            response = requests.get(url, headers=headers, timeout=10)
            print(response.status_code)
            return response
        except requests.exceptions.RequestException:
            if attempt == max_attempts:
                break
            delay = attempt * 2
            # Report the delay actually used rather than a fixed number.
            print(f">> 获取网页出错，{delay}S后将重试获取第：{attempt} 次")
            time.sleep(delay)
    print(f">> 获取网页失败，已放弃：{url}")
    return None
 
 
 
def down_imgs(path, imgs):
    """Download every URL in *imgs* concurrently, one thread per URL.

    Each thread runs ``get_img(path, url)``.  The function returns only after
    all started threads have finished, then prints a completion message.

    Args:
        path: destination prefix forwarded to ``get_img``.
        imgs: iterable of image URLs.
    """
    workers = []
    for url in imgs:
        worker = threading.Thread(target=get_img, args=(path, url))
        worker.start()
        workers.append(worker)

    # Wait for every download thread before reporting completion.
    for worker in workers:
        worker.join()

    print("恭喜，多线程下载图片完成!")
 
 
# Download a single image
def get_img(path,img_url):
    """Fetch *img_url* via ``get_resp`` and save it under *path*.

    The file name is the last ``/``-separated segment of *img_url*.  Nothing
    is written when ``get_resp`` returns ``None`` or the response status is
    not 200, so failed downloads do not leave broken image files behind.

    Args:
        path: destination prefix; the file name is appended to it directly.
        img_url: URL of the image.
    """
    img_name = img_url.split('/')[-1]
    r = get_resp(img_url)
    time.sleep(1)
    if r is None or r.status_code != 200:
        print(f">> {img_name}下载图片失败，已跳过")
        return
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">> {img_name}下载图片成功")
 
 
def main():
    """Entry point: delegate to ``get_hrefs()``."""
    get_hrefs()


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

	# Excerpt from the body of get_detail(); `html` is the decoded page source.
	tree = etree.HTML(html)
	# Page title, used for both the folder name and the text file name.
	h1 = tree.xpath('//h1[@class="entry-title"]/text()')[0]
	pattern = r"[\/\\\:\*\?\"\<\>\\|]"
	h1 = re.sub(pattern, "_", h1)  # replace the matched characters with an underscore
	print(h1)
	path = f'{h1}/'
	os.makedirs(path, exist_ok=True)
	print(f">> 生成保存目录 {h1} 文件夹成功！")
	# Concatenate the text nodes of every <p> directly inside div.main-text.
	ptexts = tree.xpath('//div[@class="main-text"]/p/text()')
	ptext = ''.join(ptexts)
	print(ptext)
	with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
	    f.write(f'{h1}\n{ptext}')
	print(f">> 保存 {h1}.txt 文件成功！")
	# Image URLs taken from the slider markup.
	imgs = tree.xpath('//div[@class="slider-for"]/div[@class="sp-slide"]/img/@src')

	# -*- coding: UTF-8 -*-
	# @公众号：eryeji
	# https://www.nendo.jp/jp/works/

	import requests
	from lxml import etree
	import time
	import random
	import re
	import threading
	import os


	def get_ua():
	    """Return one User-Agent string picked at random from a fixed list.

	    Returns:
	        str: a desktop browser User-Agent value.
	    """
	    # Each entry must be a bare User-Agent value, with no descriptive label
	    # (such as a browser name) appended after it.
	    ua_list = (
	        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
	        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
	        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
	        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
	        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
	        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
	        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
	        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
	    )
	    return random.choice(ua_list)


	def get_hrefs():
	    """Fetch the works index page and process every detail link found on it.

	    The ``href`` of each ``a`` directly inside a ``div`` whose class is
	    ``entry-content`` is passed to ``get_detail``, followed by a 3 second
	    pause.  A detail page that raises is reported and skipped so the
	    remaining links are still processed.
	    """
	    url = "https://www.nendo.jp/jp/works/"
	    headers = {"User-Agent": get_ua()}
	    response = requests.get(url=url, headers=headers, timeout=6)
	    print(response.status_code)
	    html = response.content.decode('utf-8')
	    tree = etree.HTML(html)
	    hrefs = tree.xpath('//div[@class="entry-content"]/a/@href')
	    print(len(hrefs))
	    print(hrefs)
	    for href in hrefs:
	        try:
	            get_detail(href)
	        except Exception as err:
	            # Per-page boundary: one broken page must not abort the whole run.
	            print(f">> 采集 {href} 出错，已跳过：{err!r}")
	        time.sleep(3)



	def get_detail(href):
	    """Download one detail page and save its title, text and images.

	    A folder named after the sanitised page title is created relative to the
	    current working directory.  ``<title>.txt`` is written into it (title on
	    the first line, then the concatenated paragraph text), and the slider
	    image URLs are passed to ``down_imgs``.

	    Args:
	        href: URL of the detail page.

	    Raises:
	        ValueError: if the page has no non-blank ``h1.entry-title`` text.
	    """
	    headers = {"User-Agent": get_ua()}
	    response = requests.get(url=href, headers=headers, timeout=6)
	    print(response.status_code)
	    html = response.content.decode('utf-8')
	    tree = etree.HTML(html)
	    titles = tree.xpath('//h1[@class="entry-title"]/text()')
	    # Surrounding whitespace would end up in the folder and file names, and an
	    # empty title would make the target path "/", so both are rejected here.
	    h1 = titles[0].strip() if titles else ''
	    if not h1:
	        raise ValueError(f"no entry-title text found on {href}")
	    pattern = r"[\/\\\:\*\?\"\<\>\\|]"
	    h1 = re.sub(pattern, "_", h1)  # replace the matched characters with an underscore
	    print(h1)
	    path = f'{h1}/'
	    os.makedirs(path, exist_ok=True)
	    print(f">> 生成保存目录 {h1} 文件夹成功！")
	    ptexts = tree.xpath('//div[@class="main-text"]/p/text()')
	    ptext = ''.join(ptexts)
	    print(ptext)
	    with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
	        f.write(f'{h1}\n{ptext}')
	    print(f">> 保存 {h1}.txt 文件成功！")
	    imgs = tree.xpath('//div[@class="slider-for"]/div[@class="sp-slide"]/img/@src')
	    print(len(imgs))
	    print(imgs)
	    down_imgs(path, imgs)




	# One initial attempt plus up to 3 retries
	def get_resp(url):
	    """GET *url* with a random User-Agent, retrying on request errors.

	    The request is attempted at most 4 times.  After a failed attempt the
	    function waits 2, 4 and then 6 seconds before the next one; nothing is
	    announced or waited for after the last attempt.

	    Args:
	        url: address to fetch.

	    Returns:
	        The response of the first attempt that does not raise, or ``None``
	        when every attempt raised ``requests.exceptions.RequestException``.
	    """
	    max_attempts = 4
	    for attempt in range(1, max_attempts + 1):
	        try:
	            headers = {"User-Agent": get_ua()}
	            response = requests.get(url, headers=headers, timeout=10)
	            print(response.status_code)
	            return response
	        except requests.exceptions.RequestException:
	            if attempt == max_attempts:
	                break
	            delay = attempt * 2
	            # Report the delay actually used rather than a fixed number.
	            print(f">> 获取网页出错，{delay}S后将重试获取第：{attempt} 次")
	            time.sleep(delay)
	    print(f">> 获取网页失败，已放弃：{url}")
	    return None



	def down_imgs(path, imgs):
	    """Download every URL in *imgs* concurrently, one thread per URL.

	    Each thread runs ``get_img(path, url)``.  The function returns only after
	    all started threads have finished, then prints a completion message.

	    Args:
	        path: destination prefix forwarded to ``get_img``.
	        imgs: iterable of image URLs.
	    """
	    threadings = []
	    for img in imgs:
	        t = threading.Thread(target=get_img, args=(path, img))
	        threadings.append(t)
	        t.start()

	    # Wait for every download thread before reporting completion.
	    for x in threadings:
	        x.join()

	    print("恭喜，多线程下载图片完成!")


	# Download a single image
	def get_img(path,img_url):
	    """Fetch *img_url* via ``get_resp`` and save it under *path*.

	    The file name is the last ``/``-separated segment of *img_url*.  Nothing
	    is written when ``get_resp`` returns ``None`` or the response status is
	    not 200, so failed downloads do not leave broken image files behind.

	    Args:
	        path: destination prefix; the file name is appended to it directly.
	        img_url: URL of the image.
	    """
	    img_name = img_url.split('/')[-1]
	    r = get_resp(img_url)
	    time.sleep(1)
	    if r is None or r.status_code != 200:
	        print(f">> {img_name}下载图片失败，已跳过")
	        return
	    with open(f'{path}{img_name}', 'wb') as f:
	        f.write(r.content)
	    print(f">> {img_name}下载图片成功")


	def main():
	    """Entry point: delegate to ``get_hrefs()``."""
	    get_hrefs()


	# Run the crawl only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()