Python is a cross-platform, object-oriented, dynamically typed programming language. Python is free software: the source code and the CPython interpreter are released under the GPL-compatible PSF license. As new versions and language features keep being added, Python is increasingly used for standalone, large-scale projects.
Fetching a web page quickly: use urllib's most basic fetching functionality to save the Baidu homepage into a local directory.
import urllib.request

res = urllib.request.urlopen("https://www.baidu.com")
html = res.read()                  # read the body once; a second read() would return nothing
print(html.decode("utf-8"))

f = open("./test.html", "wb")      # save a local copy
f.write(html)
f.close()
Making a POST request: the example above fetched Baidu with a GET request; below, urllib is used to send a POST request.
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({"hello": "lyshark"}), encoding="utf-8")
print(data)
response = urllib.request.urlopen('http://www.baidu.com/post', data=data)
print(response.read())
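The Baidu endpoint above does not actually accept POST data, so the response is not very instructive; to see the submitted form echoed back, the same code can be pointed at a test service such as httpbin.org. A minimal sketch, assuming that host is reachable:

import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({"hello": "lyshark"}), encoding="utf-8")
# httpbin.org/post echoes the submitted form fields back as JSON
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))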
Setting a timeout: give the request a timeout instead of letting the program wait indefinitely for a result.
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com', timeout=1)
print(response.read())
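If the server does not answer within the timeout, urlopen raises an exception rather than returning. A minimal sketch of catching it (the deliberately tiny 0.01 s timeout is only there to force the error):

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen("http://www.baidu.com", timeout=0.01)
    print(response.read())
except urllib.error.URLError as e:
    # connect-phase timeouts surface as URLError wrapping socket.timeout
    if isinstance(e.reason, socket.timeout):
        print("request timed out")
    else:
        print("request failed:", e.reason)
except socket.timeout:
    # read-phase timeouts may raise socket.timeout directly
    print("request timed out while reading")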
Getting site status: status, getheaders(), and getheader("server") give us the status code and the header information.
import urllib.request

res = urllib.request.urlopen("https://www.python.org")
print(type(res))                 # <class 'http.client.HTTPResponse'>
print(res.status)                # status code, e.g. 200
print(res.getheaders())          # all response headers as a list of tuples
print(res.getheader("server"))   # a single header looked up by name
Disguising the visit: add header information to the request so that you control the headers sent when visiting the site and are less likely to be blocked.
from urllib import request, parse

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'mkdirs.org'
}
form = {
    'name': 'LyShark'
}
data = bytes(parse.urlencode(form), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Simple URL page concatenation:
def Get_Url(target, start, ends):
    urls = []
    for i in range(start, ends):
        url = target + "/" + str(i)
        urls.append(url)
    return urls

if __name__ == "__main__":
    url = Get_Url("https://www.mzitu.com/214261", 1, 10)
    print(url)
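The same list can be built more compactly with an f-string inside a list comprehension; a quick equivalent sketch (get_urls is just an illustrative name):

def get_urls(target, start, ends):
    # build target/1, target/2, ... up to target/(ends-1) in one expression
    return [f"{target}/{i}" for i in range(start, ends)]

if __name__ == "__main__":
    print(get_urls("https://www.mzitu.com/214261", 1, 10))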
Using the requests library:
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

if __name__ == "__main__":
    ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=1)
    all_pic_link = re.findall('<img src="(.*?)"', ret.text, re.S)
    print(all_pic_link)
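Before parsing, it is usually worth checking that the request actually succeeded; a minimal sketch using status_code and raise_for_status(), both part of the requests response API:

import requests

head = {'user-agent': 'Mozilla/5.0'}
ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=3)
print(ret.status_code)       # 200 on success
ret.raise_for_status()       # raises requests.HTTPError for 4xx/5xx responses
print(ret.headers.get("Content-Type"))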
A simple image scraper:
import re
import urllib.request

def open_url(url):
    ret = urllib.request.Request(url)
    ret.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
    page = urllib.request.urlopen(ret)
    html = page.read().decode("utf-8")
    return html

def get_img(html):
    ret = re.findall(r'<img src="([^"]+\.jpg)"', html)
    for each in ret:
        filename = each.split("/")[-1]
        print("Full path:", each)
        print("File name:", filename)
        urllib.request.urlretrieve(each, filename, None)

if __name__ == '__main__':
    html = open_url("https://www.mzitu.com/210402")
    get_img(html)
Scraping the daily CVE vulnerability list:
import re
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def Get_CVE(url):
    new_cve = []
    ret = requests.get(url=url, headers=head, timeout=3)
    bs = BeautifulSoup(ret.text, 'html.parser')
    for i in bs.find_all('a'):
        href = i.get('href')
        new_cve.append(href)
    return new_cve

def Get_Number(link_list):
    new = []
    for i in link_list:
        temp = re.findall("[0-9]{1,}-.*", str(i))
        if temp:                                   # only keep links that actually contain a CVE number
            new.append("CVE-{}".format(temp[0]))
    return new

if __name__ == "__main__":
    url = "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
    cve = Get_CVE(url)
    number = Get_Number(cve)
    for i in number:
        print("Today's CVE:", i)
Simple scraping of xicidaili proxy addresses: here we just scrape with plain regex matching, which is a rather clumsy approach.
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://www.xicidaili.com/nn/1", headers=head, timeout=3)
data = re.findall('<td>.*</td>', ret.text)

idx = 0
for i in range(0, 20):
    IP    = data[idx].replace("<td>", "").replace("</td>", "")
    Port  = data[idx+1].replace("<td>", "").replace("</td>", "")
    Type  = data[idx+2].replace("<td>", "").replace("</td>", "")
    times = data[idx+3].replace("<td>", "").replace("</td>", "")
    year  = data[idx+4].replace("<td>", "").replace("</td>", "")
    print("IP: {}  Port: {}  Type: {}  Lifetime: {}  Time: {}".format(IP, Port, Type, times, year))
    idx = idx + 5
BeautifulSoup locating techniques: using the bs4 library requires installing three dependency packages: pip install requests bs4 lxml
from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://lyshark.cnblogs.com", headers=head, timeout=3)
ret.encoding = "utf-8"   # change this if the page comes back garbled
bs = BeautifulSoup(ret.text, "lxml")

# Find all link tags inside the head node, take the first one and extract its href attribute
print(bs.select('head link')[0]['href'])

# Find all a tags whose class is c_b_p_desc_readmore and extract the href field
print(bs.find_all('a', class_='c_b_p_desc_readmore')[0]['href'])

# Extract all a tags with id=blog_nav_admin and class=menu, then pull out their href field
print(bs.find_all('a', id='blog_nav_admin', class_='menu')[0]['href'])
print(bs.find_all('a', id='blog_nav_admin', class_='menu')[0].attrs['href'])

# Extract the link tag inside the div whose id is page_begin_html
print(bs.select('div[id="page_begin_html"] link')[0]['href'])
print(bs.select('ul[id="navList"] .menu')[0]['href'])

# Extract the first link element inside div[id=page_begin_html] directly under body
print(bs.select('body > div[id="page_begin_html"] > link')[0])

# Extract the text content of a given tag
print(bs.select('title')[0].get_text())
print(bs.select('a[href="https://www.cnblogs.com/LyShark/archive/2019/12/04.html"]'))

# Locate nested tags: the a / span inside the divs under div[id=header]
print(bs.select('div[id="header"] div[id="blogTitle"] a[id="lnkBlogLogo"]'))
print(bs.select('body div[id="header"] div[class="blogStats"] span[id="stats_post_count"]'))
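The CSS-selector and find/find_all styles are interchangeable for most lookups; a self-contained sketch on a made-up inline fragment showing a few equivalences:

from bs4 import BeautifulSoup

html = '''
<div id="header"><a class="menu" href="/admin">admin</a></div>
<div id="page_begin_html"><link href="/style.css"></div>
'''
bs = BeautifulSoup(html, "html.parser")

# CSS selector vs. find/find_all for the same elements
print(bs.select_one('div[id="header"] a.menu')['href'])         # /admin
print(bs.find('a', class_='menu')['href'])                      # /admin
print(bs.select('div[id="page_begin_html"] link')[0]['href'])   # /style.css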
A simple application of the stripped_strings method: extract all of the strings under the house-name tag.
from bs4 import BeautifulSoup
import requests

# the html5lib package only needs to be installed; BeautifulSoup loads the parser by name
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))

bs = BeautifulSoup(text, "html5lib")
ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')
for i in ret:
    # house = i.get_text()              # extract all strings, keeping their original formatting
    house = list(i.stripped_strings)    # extract the strings and return them as a list
    print(house)
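To see the difference between get_text() and stripped_strings without hitting the site, here is a self-contained sketch on a made-up fragment:

from bs4 import BeautifulSoup

html = '''
<p class="house-name">
    <span>Sunny Garden</span>
    <span>3 rooms</span>
    <span>89 m2</span>
</p>
'''
bs = BeautifulSoup(html, "html.parser")
p = bs.find('p', class_='house-name')

print(p.get_text())                # one string, newlines and indentation preserved
print(list(p.stripped_strings))    # ['Sunny Garden', '3 rooms', '89 m2']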
Scraping weather.com.cn (China Weather Network):
from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="http://www.weather.com.cn/textFC/shandong.shtml", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))
bs = BeautifulSoup(text, "html5lib")

div = bs.find_all('div', class_='conMidtab')[0]   # locate the first conMidtab block
tr = div.find_all('tr')[2:]                       # find the tr tags inside conMidtab, skipping the two header rows
for i in tr:
    td = i.find_all('td')                         # all td tags in the current row
    city_td = td[0]                               # the first td holds the city name
    # stripped_strings yields every non-tag descendant string, with empty strings removed
    city = list(city_td.stripped_strings)[0]
    temp = td[-5]                                 # the td holding the temperature
    temperature = list(temp.stripped_strings)[0]
    print('City: {}  Temperature: {}'.format(city, temperature))
Scraping xicidaili proxies with the bs4 library: using the library, the whole job is done in no time.
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://www.xicidaili.com/wt/", headers=head, timeout=3)
bs = BeautifulSoup(ret.text, "lxml")
ret = bs.select('table[id="ip_list"] tr[class="odd"]')

ip = []
for i in ret:
    house = list(i.stripped_strings)
    ip.append(house)

for i in range(0, 50):
    proxy_addr = "http://{}:{}".format(ip[i][0], ip[i][1])
    print(proxy_addr, file=open("save.log", 'a+', encoding='utf-8'))
    print("Proxy address (saved): {}".format(proxy_addr))
Using a proxy IP address with requests
from time import sleep
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = {"http": "http://127.0.0.1:9999"}
# Without authentication:  "http": "http://ip:port"
# With authentication:     "https": "https://username:password@ip:port"

file = open("save.log", "r", encoding="utf-8")
for i in file.readlines():
    data = "".join(i.split('\n'))     # strip the trailing newline
    proxy.update(http=data)           # replace the proxy entry with the current line
    try:
        ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
        if ret.status_code == 200:
            print("Proxy: {}  request completed".format(proxy["http"]))
        else:
            print("Proxy: {}  returned status {}".format(proxy["http"], ret.status_code))
    except requests.exceptions.RequestException:
        print("Proxy: {}  offline, request failed".format(proxy["http"]))
    sleep(1)
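A quick way to confirm that traffic really goes through the proxy is to ask an echo service for the origin IP; a sketch assuming httpbin.org is reachable and a proxy is actually listening on 127.0.0.1:9999:

import requests

proxy = {
    "http": "http://127.0.0.1:9999",
    "https": "http://127.0.0.1:9999",
}
try:
    # httpbin.org/ip echoes back the IP address the request appears to come from
    ret = requests.get("http://httpbin.org/ip", proxies=proxy, timeout=3)
    print("exit IP seen by the server:", ret.json()["origin"])
except requests.exceptions.RequestException as e:
    print("proxy check failed:", e)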
Downloading a file through a proxy with requests
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = {"http": "http://117.69.200.46:9999"}
url = "https://nmap.org/dist/nmap-7.80-win32.zip"

ret = requests.get(url=url, headers=head, stream=True, proxies=proxy)
fp = open("nmap.zip", "wb")
for chunk in ret.iter_content(chunk_size=4096):
    if chunk:
        print("Saved chunk of length: {}".format(len(chunk)))
        fp.write(chunk)
fp.close()
Simple subdomain scraping
import requests
import json

def GetSubDomain(domain):
    url = "http://ce.baidu.com/index/getRelatedSites?site_address={}".format(domain)
    ret = requests.get(url=url)
    obj = json.loads(ret.text)
    sites = obj.get("data")
    print("Number of subdomains: {}".format(len(sites)))
    fp = open("domain.log", "w")
    for item in sites:
        fp.write(item.get("domain"))
        fp.write("\n")
        print(item)
    fp.close()

GetSubDomain("qq.com")
Cnblogs automatic backup tool: automatically backs up cnblogs posts, including their images, in one quick pass.
from bs4 import BeautifulSoup
import requests, os

header = {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}

def get_url(name, start_page, end_page):
    title = []
    value = []
    for x in range(start_page, end_page + 1):
        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name, x)
        response = requests.get(url, headers=header, timeout=5)
        text = str(response.content.decode("utf-8"))
        bs = BeautifulSoup(text, "lxml")
        ret = bs.select('div[class="day"] div[class="postTitle"] a')
        for item in range(0, len(ret)):
            x = ret[item].get_text().replace("\n", "")
            y = ret[item].get('href').replace("\n", "")
            title.append(x)
            value.append(y)
            print("[+] Post: ---> URL: {} ---> Title: {}".format(y, x))
    return title, value

def down_page(page_name, url):
    params = {"enc": "utf-8"}
    response = requests.get(url=url, params=params, headers=header)
    # print(response.encoding)            # the encoding declared by the response
    # print(response.apparent_encoding)   # the encoding detected from the content, utf-8 here
    content = response.text.encode(response.encoding).decode(response.apparent_encoding)
    os.makedirs(page_name, exist_ok=True)    # create the post directory
    # Download the page and put it into the matching directory
    with open(page_name + "/" + page_name + ".html", 'w', encoding='utf-8') as f:
        f.write(content)
    # Download the images referenced by the post
    bs = BeautifulSoup(content, "lxml")
    ret = bs.select('div[id="cnblogs_post_body"] div[class="left-9-code"] img')
    for item in range(0, len(ret)):
        src = ret[item].get("src")
        src_name = src.split("/")[-1]
        print("[+] ---> Downloading image: {} ---> URL: {}".format(src_name, src))
        img = requests.get(url=src, stream=True)
        with open(page_name + "/" + src_name, 'wb') as f:
            for chunk in img.iter_content(chunk_size=1024):
                f.write(chunk)

if __name__ == '__main__':
    title, value = get_url("lyshark", 1, 2)
    for item in range(0, len(value)):
        print(title[item])
        down_page(title[item].replace(" ", ""), value[item])
Using the Selenium automated testing library:
<html lang="en">
<head>
    <meta charset="gbk">
    <title>Selenium Test</title>
</head>
<body>
    <div class="acount" id="aid">
        <a class="mnav" href="https://news.baidu.com" name="trnews">新闻</a>
        <a class="mnav" href="https://lyshark.cnblogs.com" name="myblog">我的博客</a>
        <a class="mnav" href="https://github.com/lyshark" name="mygit">GitHub</a>
    </div>
    <form id="forms" class="fms" name="submit_form" action="index.html">
        <span class="soutu-btn"></span>
        <p>用户: <input id="user" class="s_ipt" name="wd" value="" maxlength="255" autocomplete="off"></p>
        <p>密码: <input id="pass" class="s_ipt" name="wd" value="" maxlength="255" autocomplete="off"></p>
        <input type="submit" value="提交" />
    </form>
    <p name="p1"> hello lyshark p1</p>
    <p name="p2"> hello lyshark p2</p>
</body>
</html>
Open the test page in the browser and perform some simple element location.
# Driver download address: http://chromedriver.storage.googleapis.com/index.html
from selenium import webdriver

WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024, 768)

# Commonly used driver attributes
driver.get("http://lyshark.com")
print("Current URL: {}".format(driver.current_url))
print("Current title: {}".format(driver.title))
print("Page source: {}".format(driver.page_source))

# Basic find_element locating strategies
print(driver.find_element_by_id("user"))             # locate by ID
print(driver.find_element_by_name("p1").text)         # locate by the name attribute
print(driver.find_element_by_class_name("s_ipt"))     # locate by class name

# Locating via XPath; there are many possible XPath expressions, a few common ones are listed here
print(driver.find_element_by_xpath("//form[@class='fms']//input[@id='user']"))
print(driver.find_element_by_xpath("//p[@name='p1']"))
print(driver.find_element_by_xpath("//html/body/form/p/input"))
print(driver.find_elements_by_css_selector(".fms #user"))

# Locate a tags by their link text
print(driver.find_element_by_link_text("新闻"))
print(driver.find_element_by_partial_link_text("我"))
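Note that the find_element_by_* helpers shown above were removed in Selenium 4; a rough equivalent of the same lookups with the By locator, as a sketch assuming Selenium 4 with a chromedriver resolved by Selenium Manager:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()    # Selenium 4 can locate a matching chromedriver by itself
driver.get("http://lyshark.com")

print(driver.find_element(By.ID, "user"))
print(driver.find_element(By.NAME, "p1").text)
print(driver.find_element(By.CLASS_NAME, "s_ipt"))
print(driver.find_element(By.XPATH, "//form[@class='fms']//input[@id='user']"))
print(driver.find_elements(By.CSS_SELECTOR, ".fms #user"))
print(driver.find_element(By.LINK_TEXT, "新闻"))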
A simple example of switching between multiple browser tabs
# -*- coding:utf-8 -*-
from selenium import webdriver
import time

WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024, 768)

driver.get("https://www.baidu.com")
driver.find_element_by_id("kw").send_keys("lyshark")   # type the keyword "lyshark" into the search box (id=kw)
driver.find_element_by_id("su").click()                # click the search button (id=su)
time.sleep(1)

# XPath: inside the div with id=1, find the a tag whose text contains '-' and click it
driver.find_element_by_xpath("//div[@id='1']//a[contains(text(),'-')]").click()
time.sleep(1)

handle = driver.current_window_handle    # handle of the current window
handle_all = driver.window_handles       # handles of all currently open windows
print(handle_all)

driver.switch_to.window(handle_all[0])   # switch back to the first window
time.sleep(1)
driver.find_element_by_id("kw").clear()  # then clear the contents of the search box
Locate the tags with XPath, type the content automatically, and submit the login request to the backend, as shown below.
from selenium import webdriver

WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024, 768)

driver.get("http://lyshark.com")
# Locate the username field via XPath and type the username "lyshark"
driver.find_element_by_xpath("//form[@class='fms']/p//input[@id='user']").send_keys("lyshark")
# Locate the password field via XPath, clear its default value, then type the password 123123
driver.find_element_by_xpath("//form[@class='fms']/p//input[@id='pass']").clear()
driver.find_element_by_xpath("//form[@class='fms']/p//input[@id='pass']").send_keys("123123")
# Submit the request; the two usual ways are click() and submit()
driver.find_element_by_xpath("//form[@class='fms']/input[@type='submit']").click()
Simulating keyboard and mouse actions with the ActionChains and Keys classes.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys

WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024, 768)
driver.get("https://www.baidu.com")

# ------------------------------------------------------------------------
# The ActionChains class provides the common mouse operations:
# perform():          execute all actions stored in the chain
# context_click():    right click
# double_click():     double click
# drag_and_drop():    drag and drop
# move_to_element():  hover the mouse over an element
# Locate the element to hover over
above = driver.find_element_by_link_text("更多产品")
# Hover the mouse over the located element
ActionChains(driver).move_to_element(above).perform()

# ------------------------------------------------------------------------
# The webdriver.common.keys module provides keyboard events; common ones include:
# send_keys(Keys.BACK_SPACE)   Backspace
# send_keys(Keys.SPACE)        Space
# send_keys(Keys.TAB)          Tab
# send_keys(Keys.ESCAPE)       Escape (Esc)
# send_keys(Keys.ENTER)        Enter
# send_keys(Keys.CONTROL,'a')  Select all (Ctrl+A)
# send_keys(Keys.CONTROL,'c')  Copy (Ctrl+C)
# send_keys(Keys.CONTROL,'x')  Cut (Ctrl+X)
# send_keys(Keys.CONTROL,'v')  Paste (Ctrl+V)
# send_keys(Keys.F1)           F1
# Type into the search box
driver.find_element_by_id("kw").send_keys("seleniumm")
# Delete the extra "m" that was typed
driver.find_element_by_id("kw").send_keys(Keys.BACK_SPACE)
# Type a space followed by more text
driver.find_element_by_id("kw").send_keys(Keys.SPACE)
driver.find_element_by_id("kw").send_keys("从入门到入土")
# Ctrl+A: select everything in the input box
driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'a')
# Ctrl+X: cut the contents of the input box
driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'x')
# Ctrl+V: paste the contents back into the input box
driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'v')
# Use the Enter key instead of clicking the search button
driver.find_element_by_id("su").send_keys(Keys.ENTER)
Combining the selenium module with simulated key presses gives a simple cnblogs crawler that works quite well for backups.
from selenium import webdriver
from bs4 import BeautifulSoup
import requests, os, time
import win32api, win32con

header = {"User-Agent": "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}

def get_url(name, start_page, end_page):
    value = []
    for x in range(start_page, end_page + 1):
        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name, x)
        response = requests.get(url, headers=header, timeout=5)
        text = str(response.content.decode("utf-8"))
        bs = BeautifulSoup(text, "lxml")
        ret = bs.select('div[class="day"] div[class="postTitle"] a')
        for item in range(0, len(ret)):
            y = ret[item].get('href').replace("\n", "")
            value.append(y)
            print("[+] Crawling URL: {} ".format(y))
    return value

if __name__ == "__main__":
    value = get_url("csnd", 1, 2)
    WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
    driver = webdriver.Chrome(executable_path=WebPath)
    driver.set_window_size(1024, 768)
    for item in range(0, len(value)):
        print("[-] ---> Saving: {}".format(value[item]))
        driver.get(value[item])
        # Press Ctrl+S
        win32api.keybd_event(0x11, 0, 0, 0)
        win32api.keybd_event(0x53, 0, 0, 0)
        win32api.keybd_event(0x53, 0, win32con.KEYEVENTF_KEYUP, 0)
        win32api.keybd_event(0x11, 0, win32con.KEYEVENTF_KEYUP, 0)
        # Press Enter
        time.sleep(1)
        win32api.keybd_event(0x0D, 0, 0, 0)
        win32api.keybd_event(0x0D, 0, win32con.KEYEVENTF_KEYUP, 0)
Using the standard-library HTML parser:
# Define a MyParser class that inherits from HTMLParser
from html.parser import HTMLParser

class MyParser(HTMLParser):
    re = []    # collected results
    flg = 0    # flag marking whether the tag we want has been found

    def handle_starttag(self, tag, attrs):
        if tag == 'h3':    # the target tag
            for attr in attrs:
                if attr[0] == 'class' and attr[1] == 'tb-main-title':    # the attribute the target tag must carry
                    self.flg = 1    # set the flag when the condition matches
                    break
        else:
            pass

    def handle_data(self, data):
        if self.flg == 1:
            self.re.append(data.strip())    # if the flag is set, store the data in the result list
            self.flg = 0                    # reset the flag for the next iteration
        else:
            pass

my = MyParser()
my.feed(html)    # html holds the page source fetched elsewhere
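A self-contained way to exercise the parser is to feed it a small inline fragment; a sketch with made-up HTML that contains the target h3 tag:

sample = '''
<html><body>
  <h3 class="tb-main-title"> Sample product title </h3>
  <h3 class="other">ignored</h3>
</body></html>
'''
parser = MyParser()
parser.feed(sample)
print(parser.re)    # ['Sample product title']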