利用爬虫获取当前博文数量与字数

Python
347
0
0
2022-04-21
标签   Python爬虫

由于个人博客没有博文统计功能,我手写了一个爬虫,用于获取当前的博文数量与总字数。具体思路是:先逐页获取完整的文章列表,再遍历每篇文章,累加文章数量与字数。

import requests
from lxml import etree
import re
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def _fetch_html(session, url):
    """Download *url* with *session* and return the parsed lxml HTML tree."""
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = session.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    response.raise_for_status()
    return etree.HTML(response.content.decode())


def post_statistic():
    """Crawl the blog and print the number of posts and their total word count.

    The listing at ``/posts/`` is fetched first to read the total page count
    from the pagination widget; it is then reused as page 1, and pages
    ``2..page`` are fetched from ``/posts/page/<n>/``. Every post linked from
    those listings is downloaded and the number in its "约 N 字" meta line is
    added to the running total.

    A listing without a pagination widget is treated as a single page. A post
    whose word count cannot be found is reported and skipped rather than
    aborting the whole run.

    Raises:
        requests.RequestException: on a network failure, timeout, or HTTP
            error status.
        ValueError: if the pagination entry that is read is not a number.
    """
    base_url = 'https://silencehuliang.github.io'
    start_url = base_url + '/posts/'
    archive_xpath = '//article [@class="archive-item"]'
    link_xpath = '//a [@class="archive-item-link"]/@href'
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    # Compiled once here instead of being re-evaluated for every post.
    word_pattern = re.compile(r'约 (\d+) 字')

    # One session reuses the underlying connection across all requests.
    with requests.Session() as session:
        html = _fetch_html(session, start_url)
        last_page = html.xpath('//ul [@class="pagination"]/li[5]//a/text()')
        # No pagination widget means everything fits on the first page.
        page = int(last_page[0]) if last_page else 1
        print('当前博客总页数为:{}'.format(page))

        archive_count = 0
        post_url_list = []
        for i in range(1, page + 1):
            print("开始访问第{}页".format(i))
            # Page 1 is the listing already downloaded from start_url, so only
            # later pages are requested; this avoids fetching (and possibly
            # counting) the first page twice.
            if i > 1:
                html = _fetch_html(
                    session, '{}/posts/page/{}/'.format(base_url, i))
            archive_count += len(html.xpath(archive_xpath))
            post_url_list.extend(html.xpath(link_xpath))

        num = 0
        for p_url in post_url_list:
            html = _fetch_html(session, base_url + p_url)
            meta = html.xpath('//div [@class="post-meta"]/div[2]/text()[4]')
            match = word_pattern.search(meta[0]) if meta else None
            if match is None:
                # Keep going: one post without a word count should not throw
                # away the statistics gathered for all the others.
                print("未能获取字数,已跳过:{}".format(p_url))
                continue
            num += int(match.group(1))

    print("目前博文数量为:{},总字数为:{}".format(archive_count, num))


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    post_statistic()