由于个人博客没有博文统计的功能,于是自己手写了一个爬虫,用于获取当前博文数量与总字数。具体思路是:先获取整个文章列表,再遍历每篇文章来统计数量与字数。
import requests
from lxml import etree
import re
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Matches the "约 N 字" ("about N words") label shown in a post's meta block.
# Raw string so that ``\d`` is a regex class, not an invalid string escape.
_WORD_COUNT_RE = re.compile(r'约 (\d+) 字')


def _fetch_html(session, url, timeout):
    """Download *url* with *session* and return it parsed as an lxml HTML tree."""
    response = session.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    return etree.HTML(response.content.decode())


def _total_pages(html):
    """Return the number of listing pages advertised by the pagination bar.

    The fifth pagination item is tried first (the layout the crawler was
    written for).  If it is missing or not numeric, the largest numeric link
    label in the bar is used; a listing without pagination counts as 1 page.
    """
    labels = html.xpath('//ul [@class="pagination"]/li[5]//a/text()')
    if labels and labels[0].strip().isdecimal():
        return int(labels[0])
    numbers = [
        int(label)
        for label in html.xpath('//ul [@class="pagination"]//a/text()')
        if label.strip().isdecimal()
    ]
    return max(numbers, default=1)


def _post_word_count(html):
    """Return the word count shown in a post's meta block, or None if absent."""
    # Preferred location first, then every text node of the meta block so a
    # slightly different meta layout does not abort the whole crawl.
    candidates = html.xpath('//div [@class="post-meta"]/div[2]/text()[4]')
    candidates += html.xpath('//div [@class="post-meta"]//text()')
    for text in candidates:
        match = _WORD_COUNT_RE.search(text)
        if match:
            return int(match.group(1))
    return None


def post_statistic(base_url='https://silencehuliang.github.io', timeout=10):
    """Crawl the blog's post listing and print the post count and total words.

    The listing at ``<base_url>/posts/`` is read to find the number of listing
    pages, every listing page is scanned for post links, and each post is then
    fetched to read the word count from its meta block.  Progress and the
    final totals are printed to stdout.

    Args:
        base_url: Root URL of the blog (no path component expected).
        timeout: Per-request timeout in seconds passed to ``requests``.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
    """
    from urllib.parse import urljoin

    base_url = base_url.rstrip('/')
    archive_count = 0
    post_url_list = []

    # One session for the whole crawl so connections to the host are reused.
    with requests.Session() as session:
        html = _fetch_html(session, base_url + '/posts/', timeout)
        page = _total_pages(html)
        print('当前博客总页数为:{}'.format(page))

        for i in range(1, page + 1):
            print("开始访问第{}页".format(i))
            # The landing page fetched above already is page 1; requesting
            # /posts/page/1/ again would either double-count its posts or
            # waste a request on a redirect stub, so only pages 2+ are fetched.
            if i > 1:
                html = _fetch_html(
                    session, '{}/posts/page/{}/'.format(base_url, i), timeout
                )
            archive_count += len(html.xpath('//article [@class="archive-item"]'))
            post_url_list.extend(html.xpath('//a [@class="archive-item-link"]/@href'))

        num = 0
        for p_url in post_url_list:
            # urljoin handles both root-relative and absolute hrefs.
            post_url = urljoin(base_url + '/', p_url)
            words = _post_word_count(_fetch_html(session, post_url, timeout))
            if words is None:
                # Skip posts without a readable word count instead of crashing.
                print('未找到字数信息,已跳过:{}'.format(post_url))
                continue
            num += words

    print("目前博文数量为:{},总字数为:{}".format(archive_count, num))
if __name__ == "__main__":
    # Run the crawl only when this file is executed as a script, not on import.
    post_statistic()