Python Requests 实现简单网络请求

Python 是一种跨平台的计算机程序设计语言，是一种面向对象的动态类型语言。Python 是自由的开源软件，其参考实现 CPython 的源代码和解释器以 PSF 许可证（Python Software Foundation License，与 GPL 兼容）发布。随着版本的不断更新和语言新功能的添加，Python 越来越多被用于独立的、大型项目的开发。

快速抓取网页: 使用urllib最基本的抓取功能,将百度首页的内容保存到本地目录下.

 
>>> import urllib.request
>>>
>>> res=urllib.request.urlopen("https://www.baidu.com")
>>> print(res.read().decode("utf-8"))
 
>>> f=open("./test.html","wb")      #保存在本地
>>> f.write(res.read())
>>> f.close()

实现POST请求: 上面的例子通过GET请求获取了百度首页的内容,下面使用urllib发送POST请求.

 
>>> import urllib.parse
>>> import urllib.request
>>>
>>> data=bytes(urllib.parse.urlencode({"hello":"lyshark"}),encoding="utf-8")   # URL-encode the form fields, then convert to bytes
>>> print(data)   # prints b'hello=lyshark'
>>> response = urllib.request.urlopen('http://www.baidu.com/post',data=data)   # supplying data= turns the request into a POST
>>> print(response.read())   # NOTE(review): confirm this URL really accepts POST requests

设置TIMEOUT时间: 我们需要给请求设置一个超时时间,而不是让程序一直在等待结果.

 
import socket
import urllib.error
import urllib.request

# Give up after one second instead of waiting indefinitely for the server.
try:
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen('http://www.baidu.com', timeout=1) as response:
        print(response.read())
except (urllib.error.URLError, socket.timeout) as err:
    # A timeout while connecting is reported as URLError, a timeout while
    # reading the body as socket.timeout.
    print("request failed or timed out:", err)

获取网站状态: 我们可以通过status、getheaders(),getheader("server"),获取状态码以及头部信息.

 
>>> import urllib.request
>>>
>>> res=urllib.request.urlopen("https://www.python.org")
>>> print(type(res))
<class 'http.client.HTTPResponse'>
>>>
>>> res.status                 # HTTP status code of the response
>>> res.getheaders()           # all response headers
>>> res.getheader("server")    # value of the "server" response header

伪装访问网站: 给请求添加头部信息,从而定制自己请求网站时的头部信息,防止被网站屏蔽.

 
from urllib import request,parse
 
url = 'http://www.baidu.com'
# Custom request headers; the User-Agent makes the request look like a browser.
# NOTE(review): the Host header names a different site than `url` -- confirm
# that the mismatch is intended before reusing this snippet.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'mkdirs.org'
}
# Form fields for the POST body (named so the built-in `dict` is not shadowed).
form_fields = {
    'name': 'LyShark'
}
data = bytes(parse.urlencode(form_fields), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
# The context manager closes the connection once the body has been read.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

简单的URL页面拼接:

 
import re
 
def Get_Url(target,start,ends):
    """Return the URLs ``target/<n>`` for every n in ``range(start, ends)``.

    ``ends`` is exclusive; an empty range gives an empty list.
    """
    return [target + "/" + str(page) for page in range(start, ends)]
 
if __name__ == "__main__":
    # range() excludes the end value, so this builds the URLs for pages 1 to 9.
    url = Get_Url("https://www.mzitu.com/214261",1,10)
    print(url)

requests库的使用:

 
import re
import requests
 
head={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
 
if __name__ == "__main__":
    # Download the page, then collect the src attribute of every <img> tag.
    img_src_pattern = re.compile('<img src="(.*?)"', re.S)
    ret = requests.get(
        url="https://www.mzitu.com/214261",
        headers=head,
        timeout=1,
    )
    all_pic_link = img_src_pattern.findall(ret.text)
    print(all_pic_link)

简单实现爬取图片:

 
import re
import urllib.request
 
def open_url(url):
    """Download ``url`` with a browser-like User-Agent and return the page text.

    The body is decoded as UTF-8 and the connection is closed before returning.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
    # The context manager releases the socket even if read()/decode() raises.
    with urllib.request.urlopen(req) as page:
        return page.read().decode("utf-8")
 
def get_img(html):
    """Download every ``.jpg`` referenced by an ``<img src="...">`` tag in ``html``.

    Each image is saved in the current directory under the last path
    component of its URL.  Returns ``None``.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    for each in re.findall(r'<img src="([^"]+\.jpg)"', html):
        filename = each.split("/")[-1]
        print("完整路径:",each)
        print("文件名称:",filename)
        urllib.request.urlretrieve(each,filename,None)
 
if __name__ == '__main__':
    # Despite its name, `url` holds the HTML text returned by open_url().
    url = open_url("https://www.mzitu.com/210402")
    get_img(url)

爬每日CVE漏洞列表:

 
import re
import requests
from bs4 import BeautifulSoup
 
head={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
 
def Get_CVE(url):
    """Fetch ``url`` and return the ``href`` of every ``<a>`` tag, in page order."""
    page = requests.get(url=url, headers=head, timeout=3)
    soup = BeautifulSoup(page.text, 'html.parser')
    return [anchor.get('href') for anchor in soup.find_all('a')]
 
def Get_Number(list):
    """Turn link targets into ``CVE-<year>-<number>`` identifiers.

    Args:
        list: iterable of href values; ``None`` entries are tolerated.

    Returns:
        One ``"CVE-<year>-<number>"`` string per entry that contains an
        identifier; entries without one are skipped.
    """
    new = []
    for i in list:
        # Use the matched text itself: formatting the findall() list produced
        # strings such as "CVE-['2019-1234']" and "CVE-[]".
        found = re.search(r"[0-9]{4}-[0-9]{4,}", str(i))
        if found:
            new.append("CVE-{}".format(found.group(0)))
    return new
 
if __name__ == "__main__":
    # Page that lists the CVE entries changed today.
    url= "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
    cve = Get_CVE(url)          # every link target found on the page
    number = Get_Number(cve)    # link targets converted to "CVE-..." strings
    for i in number:
        print("今日份的漏洞:",i)

简单爬取西刺代理地址: 此处我们就用简单的正则匹配爬取,该方法比较笨拙.

 
import re
import requests
 
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/nn/1", headers=head, timeout=3)
# Text of every plain <td>...</td> cell on the page, in document order.
data = [cell.replace("<td>","").replace("</td>","") for cell in re.findall('<td>.*</td>', ret.text)]
# The cells are consumed five at a time (one proxy per group).  Show at most
# 20 groups and stop early on a shorter page instead of raising IndexError.
CELLS_PER_ROW = 5
last_start = min(len(data), 20 * CELLS_PER_ROW) - CELLS_PER_ROW
for start in range(0, last_start + 1, CELLS_PER_ROW):
    IP, Port, Type, times, year = data[start:start + CELLS_PER_ROW]
    print("IP地址:{} 端口号:{} 类型:{} 生存周期:{} 时间:{}".format(IP,Port,Type,times,year))

BeautifulSoup 定位技巧: 使用bs库之前需要先安装三个依赖包: pip install requests bs4 lxml

 
from bs4 import BeautifulSoup
import requests
 
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://lyshark.cnblogs.com", headers=head, timeout=3)
ret.encoding="utf-8"             # adjust this if the text comes out garbled
bs = BeautifulSoup(ret.text,"lxml")

# Every statement below starts at column 0: a stray leading space on a
# module-level line is an IndentationError.

# <link> tags inside <head>: href of the first one
print(bs.select('head link')[0]['href'])

# <a> tags with class c_b_p_desc_readmore: href of the first one
print(bs.find_all('a',class_='c_b_p_desc_readmore')[0]['href'])

# <a> tags with id=blog_nav_admin and class=menu: href of the first one
print(bs.find_all('a',id='blog_nav_admin',class_='menu')[0]['href'])
print(bs.find_all('a',id='blog_nav_admin',class_='menu')[0].attrs['href'])

# <link> inside the div with id=page_begin_html, and .menu inside ul#navList
print(bs.select('div[id="page_begin_html"] link')[0]['href'])
print(bs.select('ul[id="navList"] .menu')[0]['href'])

# First <link> that is a direct child of body > div[id=page_begin_html]
print(bs.select('body > div[id="page_begin_html"] > link')[0])

# Text content of a tag, and lookup of an <a> by its exact href
print(bs.select('title')[0].get_text())
print(bs.select('a[href="https://www.cnblogs.com/LyShark/archive/2019/12/04.html"]'))

# Descendant selectors: nested div / a / span located by id and class
print(bs.select('div[id="header"] div[id="blogTitle"] a[id="lnkBlogLogo"]'))
print(bs.select('body div[id="header"] div[class="blogStats"] span[id="stats_post_count"]'))

stripped_strings方法的简单应用: 提取出house-name标签下面的所有字符串

 
from bs4 import BeautifulSoup
import requests
import html5lib
 
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(
    url="https://gz.centanet.com/ershoufang/",
    headers=head,
    timeout=3,
)
# The raw bytes are decoded explicitly as UTF-8.
text = ret.content.decode('utf-8')

bs = BeautifulSoup(text,"html5lib")
# From here on `ret` is the list of matching <p class="house-name"> tags.
ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')
for i in ret:
    # stripped_strings yields each text fragment with surrounding whitespace
    # removed; i.get_text() would return the text with its formatting instead.
    house = [fragment for fragment in i.stripped_strings]
    print(house)

实现爬取中国天气网:

 
from bs4 import BeautifulSoup
import requests
import html5lib
 
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="http://www.weather.com.cn/textFC/shandong.shtml", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))

bs = BeautifulSoup(text,"html5lib")
# Keep the located block.  The original statement discarded its result, so
# the <tr> search below ran over the whole document.
# NOTE(review): index 0 follows the original comment ("the first block"),
# while the discarded expression used [1] -- confirm which block is wanted.
conMidtab = bs.find_all('div',class_='conMidtab')[0]
tr = conMidtab.find_all('tr')[2:]   # rows inside conMidtab, from the third row on
for i in tr:
    td = i.find_all('td')           # all cells of this row
    if len(td) < 5:
        continue                    # too few cells for td[-5]; skip the row
    city_td = td[0]                 # the first cell holds the city name
    # stripped_strings yields the non-empty text fragments below the tag.
    city = list(city_td.stripped_strings)[0]
    temp = td[-5]                   # the cell that holds the temperature
    temperature = list(temp.stripped_strings)[0]
    print('城市:{}   温度:{}'.format(city,temperature))

使用bs4库爬取西刺代理: 使用库的方式爬取,啪啪啪,三下五除二搞定.

 
import re
import requests
from bs4 import BeautifulSoup
 
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/wt/", headers=head, timeout=3)
bs = BeautifulSoup(ret.text,"lxml")
# From here on `ret` is the list of <tr class="odd"> rows of the ip_list table.
ret = bs.select('table[id="ip_list"] tr[class="odd"]')

# One list of text fragments per table row.
ip=[]
for i in ret:
    house =list(i.stripped_strings)
    ip.append(house)

# Open the log once and close it reliably; the original reopened the file
# for every line and never closed it.
with open("save.log",'a+',encoding='utf-8') as log:
    # At most 50 proxies; a shorter table no longer raises IndexError.
    for row in ip[:50]:
        address = "http://{}:{}".format(row[0],row[1])
        print(address,file=log)
        print("代理地址(已保存) {}".format(address))

Requests使用代理IP地址

 
import re
from time import sleep
import requests
 
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = { "http":"http://127.0.0.1:9999" }
# Without credentials: "http": "http://ip:port"
# With credentials:    "https": "https://username:password@ip:port"

with open("save.log","r",encoding="utf-8") as file:
    for i in file:
        data = i.strip()                 # drop the trailing newline
        if not data:
            continue                     # ignore blank lines
        # requests chooses the proxy by the scheme of the requested URL, and
        # the test URL is https, so register the address for both schemes.
        proxy.update(http=data, https=data)
        try:
            ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
        except requests.RequestException:
            # An unreachable proxy raises instead of returning a status code.
            ret = None
        if ret is not None and ret.status_code == 200:
            print("代理:{}  访问完成".format(proxy["http"]))
        else:
            print("代理:{}  不在线,失败".format(proxy["http"]))
        sleep(1)

Requests通过代理下载文件

 
import requests
 
head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
# requests chooses the proxy by the scheme of the requested URL; `url` is
# https, so the proxy has to be listed under "https" as well.
proxy = {
    "http":"http://117.69.200.46:9999",
    "https":"http://117.69.200.46:9999",
}

url = "https://nmap.org/dist/nmap-7.80-win32.zip"

# stream=True downloads the body chunk by chunk instead of holding it all in
# memory; both the response and the file are closed when the block exits.
with requests.get(url=url, headers=head,stream=True,proxies=proxy) as ret, open("nmap.zip","wb") as fp:
    for chunk in ret.iter_content(chunk_size=4096):
        if chunk:
            print("本次保存长度:{} ".format(len(chunk)))
            fp.write(chunk)

简单爬取子域名

 
import requests
import json
 
def GetSubDomain(domain):
    """Query the Baidu related-sites endpoint for ``domain`` and save the result.

    Prints the number of entries and every entry, and writes the ``domain``
    value of each entry to ``domain.log``, one per line.
    """
    url = "http://ce.baidu.com/index/getRelatedSites?site_address={}".format(domain)
    ret = requests.get(url=url, timeout=5)
    obj = json.loads(ret.text)
    # A reply without "data" is treated as "no entries" instead of crashing.
    sites = obj.get("data") or []
    print("子域名个数：{}".format(len(sites)))

    # The context manager closes the log even if an entry is malformed.
    with open("domain.log","w") as fp:
        for item in sites:
            fp.write(item.get("domain"))
            fp.write("\n")
            print(item)

GetSubDomain("qq.com")

博客园自动备份工具: 自动备份博客园工具,快速备份博客文章包括图片等.

 
from bs4 import BeautifulSoup
import requests,os
 
header = {"User-Agent":"Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}
 
def get_url(name,start_page,end_page):
    """Collect post titles and links from a cnblogs user's list pages.

    Args:
        name: blog account name used in the URL.
        start_page: first list page to fetch (inclusive).
        end_page: last list page to fetch (inclusive).

    Returns:
        ``(title, value)``: two parallel lists with the post titles and the
        post URLs.
    """
    title = []
    value = []
    for page in range(start_page,end_page+1):
        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name,page)
        response = requests.get(url,headers=header,timeout=5)
        text = response.content.decode("utf-8")
        bs = BeautifulSoup(text,"lxml")
        ret = bs.select('div[class="day"] div[class="postTitle"] a')
        # At most ten posts per page, as before, but a page with fewer posts
        # no longer raises IndexError.
        for link in ret[:10]:
            x = link.get_text().replace("\n","")
            y = link.get('href').replace("\n","")
            title.append(x)
            value.append(y)
            print("[+] 文章路径: ---> 地址: {} ---> 标题: {}".format(y,x))
    return title,value
 
def down_page(page_name,url):
    """Save one post and its images into a directory named after the post.

    Args:
        page_name: post title; used as directory and file name after
            characters that are invalid in file names are replaced by "_".
        url: address of the post.
    """
    params = { "enc": "utf-8" }
    response = requests.get(url=url,params=params,headers=header)
    # Decode the raw bytes with the encoding detected from the content.  The
    # original round-tripped through response.encoding, which may be None.
    content = response.content.decode(response.apparent_encoding or "utf-8", errors="replace")

    # The title comes from the web page: strip path/shell-significant
    # characters and create the directory without going through a shell.
    safe_name = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in page_name)
    os.makedirs(safe_name, exist_ok=True)
    # Store the page itself in that directory.
    with open(os.path.join(safe_name, safe_name + ".html"), 'w', encoding='utf-8') as f:
        f.write(content)

    # Download the images referenced in the post body.
    bs = BeautifulSoup(content, "lxml")
    ret = bs.select('div[id="cnblogs_post_body"] div[class="left-9-code"] img')
    for img_tag in ret:
        src = img_tag.get("src")
        if not src:
            continue    # <img> without a src attribute
        src_name = src.split("/")[-1]
        print("[+] ---> 正在准备下载图片: {} ---> 地址: {}".format(src_name,src))
        img = requests.get(url=src, stream=True)
        with open(os.path.join(safe_name, src_name),'wb') as f:
            for chunk in img.iter_content(chunk_size=1024):
                f.write(chunk)
 
if __name__ == '__main__':
    # List pages 1 and 2 of the "lyshark" blog.
    title,value = get_url("lyshark",1,2)
    for item in range(0,len(value)):
        print(title[item])
        # The title with spaces removed becomes the directory name.
        down_page(title[item].replace(" ",""),value[item])

Selenium 自动化测试库的使用:

 
<!DOCTYPE html>
<!-- Test page for the Selenium examples: three links, a login form and two paragraphs. -->
<html lang="en">
<head>
    <meta charset="gbk">
    <title>Selenium Test</title>
</head>
<body>
    <div class="acount" id="aid">
        <a class="mnav" href="https://news.baidu.com" name="trnews">新闻</a>
        <a class="mnav" href="https://lyshark.cnblogs.com" name="myblog">我的博客</a>
        <a class="mnav" href="https://github.com/lyshark" name="mygit">GitHub</a>
    </div>
    <form id="forms" class="fms" name="submit_form" action="index.html">
        <span class="soutu-btn"></span>
        <!-- Both inputs share name="wd" and class="s_ipt"; only their ids differ. -->
        <p>用户: <input id="user" class="s_ipt" name="wd" value="" maxlength="255" autocomplete="off"></p>
        <p>密码: <input id="pass" class="s_ipt" name="wd" value="" maxlength="255" autocomplete="off"></p>
        <input type="submit" value="提交" />
    </form>
    <p name="p1" > hello lyshark p1</p>
    <p name="p2" > hello lyshark p2</p>
</body>
</html>

通过简单的浏览文件并实现简单的定位.

 
# Driver download page: http://chromedriver.storage.googleapis.com/index.html
# NOTE(review): executable_path= and the find_element_by_* helpers belong to
# the older Selenium API -- confirm they exist in the installed version.
from selenium import webdriver

WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024,768)

# Commonly used page properties.
driver.get("http://lyshark.com")
print("当前URL: {}".format(driver.current_url))
print("当前标题: {}".format(driver.title))
print("网页代码: {}".format(driver.page_source))

# Basic find_element lookups
print(driver.find_element_by_id("user"))          # locate by id
print(driver.find_element_by_name("p1").text)     # locate by the name attribute
print(driver.find_element_by_class_name("s_ipt")) # locate by class name

# XPath lookups (a few common forms), followed by one CSS-selector lookup
print(driver.find_element_by_xpath("//form[@class='fms']//input[@id='user']"))
print(driver.find_element_by_xpath("//p[@name='p1']"))
print(driver.find_element_by_xpath("//html/body/form/p/input"))
print(driver.find_elements_by_css_selector(".fms #user"))   # find_elements_* (plural) variant

# Locate <a> tags by their link text (full text, then partial text).
print(driver.find_element_by_link_text("新闻"))
print(driver.find_element_by_partial_link_text("我"))

简单实现多个标签之间互相切换

 
# -*- coding:utf-8 -*-
from selenium import webdriver
import time
 
WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024,768)
driver.get("https://www.baidu.com")

driver.find_element_by_id("kw").send_keys("lyshark")  # type the keyword into the input with id=kw
driver.find_element_by_id("su").click()               # click the search button (id=su)
time.sleep(1)
# XPath: inside the div with id="1", the <a> whose text contains "-"
driver.find_element_by_xpath("//div[@id='1']//a[contains(text(),'-')]").click()
time.sleep(1)

handle = driver.current_window_handle   # handle of the current window (not used below)
handle_all = driver.window_handles      # handles of all open windows
print(handle_all)
driver.switch_to.window(handle_all[0])   # switch to the first window
time.sleep(1)
driver.find_element_by_id("kw").clear()  # then clear the search box

通过xpath定位标签并自动输入内容,发送登录请求到后端,写法如下.

 
from selenium import webdriver
 
WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024,768)

driver.get("http://lyshark.com")

# Locate the user-name input via XPath and type "lyshark" into it.
driver.find_element_by_xpath("//form[@class='fms']/p//input[@id='user']").send_keys("lyshark")

# Locate the password input via XPath, clear it, then type "123123".
driver.find_element_by_xpath("//form[@class='fms']/p//input[@id='pass']").clear()
driver.find_element_by_xpath("//form[@class='fms']/p//input[@id='pass']").send_keys("123123")

# Submit the form by clicking the submit button (submit() is the alternative).
driver.find_element_by_xpath("//form[@class='fms']/input[@type='submit']").click()

通过类库实现模拟键盘鼠标操作记录.

 
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
 
WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
driver = webdriver.Chrome(executable_path=WebPath)
driver.set_window_size(1024,768)

driver.get("https://www.baidu.com")

# ------------------------------------------------------------------------
# ActionChains provides the common mouse operations:
# perform():          run the actions queued on the chain
# context_click():    right click
# double_click():     double click
# drag_and_drop():    drag
# move_to_element():  hover over an element

# Locate the element to hover over.
above = driver.find_element_by_link_text("更多产品")
# Queue the hover on that element and execute it.
ActionChains(driver).move_to_element(above).perform()

# ------------------------------------------------------------------------
# webdriver.common.keys provides the keyboard keys; commonly used ones:
# send_keys(Keys.BACK_SPACE)   Backspace
# send_keys(Keys.SPACE)        Space
# send_keys(Keys.TAB)          Tab
# send_keys(Keys.ESCAPE)       Esc
# send_keys(Keys.ENTER)        Enter
# send_keys(Keys.CONTROL,'a')  select all (Ctrl+A)
# send_keys(Keys.CONTROL,'c')  copy (Ctrl+C)
# send_keys(Keys.CONTROL,'x')  cut (Ctrl+X)
# send_keys(Keys.CONTROL,'v')  paste (Ctrl+V)
# send_keys(Keys.F1)           F1

# Type into the search box.
driver.find_element_by_id("kw").send_keys("seleniumm")
# Delete the extra trailing "m".
driver.find_element_by_id("kw").send_keys(Keys.BACK_SPACE)
# Type a space followed by more text.
driver.find_element_by_id("kw").send_keys(Keys.SPACE)
driver.find_element_by_id("kw").send_keys("从入门到入土")

# Ctrl+A: select the whole input
driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'a')
# Ctrl+X: cut the selection
driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'x')
# Ctrl+V: paste it back into the input
driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'v')

# Press Enter on the search button instead of clicking it.
driver.find_element_by_id("su").send_keys(Keys.ENTER)

通过selenium模块配合自动按键即可实现简单的博客园自动爬行工具,用于备份非常不错.

 
from selenium import webdriver
from bs4 import BeautifulSoup
import requests,os,time,lxml
import win32api,win32con
 
header = {"User-Agent":"Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}
 
def get_url(name,start_page,end_page):
    """Collect the post links from a cnblogs user's list pages.

    Args:
        name: blog account name used in the URL.
        start_page: first list page to fetch (inclusive).
        end_page: last list page to fetch (inclusive).

    Returns:
        List of post URLs in page order.
    """
    value = []
    for page in range(start_page,end_page+1):
        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name,page)
        response = requests.get(url,headers=header,timeout=5)
        text = response.content.decode("utf-8")
        bs = BeautifulSoup(text,"lxml")
        ret = bs.select('div[class="day"] div[class="postTitle"] a')
        # At most ten posts per page, as before, but a page with fewer posts
        # no longer raises IndexError.
        for link in ret[:10]:
            y = link.get('href').replace("\n","")
            value.append(y)
            print("[+] 爬行地址: {} ".format(y))
    return value
 
if __name__ == "__main__":
    # NOTE(review): the account name "csnd" looks like a placeholder -- confirm.
    value = get_url("csnd",1,2)
    WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
    driver = webdriver.Chrome(executable_path=WebPath)
    driver.set_window_size(1024,768)

    for item in range(0,len(value)):
        print("[-] ---> 开始保存:{}".format(value[item]))
        driver.get(value[item])
        # Press Ctrl+S: key down for 0x11 and 0x53, then key up in reverse order.
        win32api.keybd_event(0x11, 0, 0, 0)
        win32api.keybd_event(0x53, 0, 0, 0)
        win32api.keybd_event(0x53, 0, win32con.KEYEVENTF_KEYUP, 0)
        win32api.keybd_event(0x11, 0, win32con.KEYEVENTF_KEYUP, 0)
        # Wait one second, then press Enter (0x0D).
        time.sleep(1)
        win32api.keybd_event(0x0D, 0, 0, 0)
        win32api.keybd_event(0x0D, 0, win32con.KEYEVENTF_KEYUP, 0)

html parser

 
from html.parser import HTMLParser


# MyParser extends HTMLParser and collects the text of every
# <h3 class="tb-main-title"> element.
class MyParser(HTMLParser):
    """Collect the stripped text that follows each ``<h3 class="tb-main-title">``.

    Attributes:
        re: list of collected strings, one per matching tag.
        flg: 1 while the next text node belongs to a matching tag, else 0.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would be shared by every
        # parser instance.
        self.re = []
        self.flg = 0

    def handle_starttag(self, tag, attrs):
        """Raise the flag when the target tag with the target class opens."""
        if tag != 'h3':
            return
        for attr in attrs:
            if attr[0] == 'class' and attr[1] == 'tb-main-title':
                self.flg = 1
                break

    def handle_data(self, data):
        """Store the first text node after a matching start tag, then reset."""
        if self.flg == 1:
            self.re.append(data.strip())
            self.flg = 0
 
 
# NOTE: `html` is not defined in this snippet; it must hold the page source
# before feed() is called.
my=MyParser()
my.feed(html)

	>>> import urllib.request
	>>>
	>>> res=urllib.request.urlopen("https://www.baidu.com")
	>>> print(res.read().decode("utf-8"))

	>>> f=open("./test.html","wb") #保存在本地
	>>> f.write(res.read())
	>>> f.close()

	>>> import urllib.parse
	>>> import urllib.request
	>>>
	>>> data=bytes(urllib.parse.urlencode({"hello":"lyshark"}),encoding="utf-8")   # URL-encode the form fields, then convert to bytes
	>>> print(data)   # prints b'hello=lyshark'
	>>> response = urllib.request.urlopen('http://www.baidu.com/post',data=data)   # supplying data= turns the request into a POST
	>>> print(response.read())   # NOTE(review): confirm this URL really accepts POST requests

	import socket
	import urllib.error
	import urllib.request

	# Give up after one second instead of waiting indefinitely for the server.
	try:
	    # The context manager closes the connection once the body has been read.
	    with urllib.request.urlopen('http://www.baidu.com', timeout=1) as response:
	        print(response.read())
	except (urllib.error.URLError, socket.timeout) as err:
	    # A timeout while connecting is reported as URLError, a timeout while
	    # reading the body as socket.timeout.
	    print("request failed or timed out:", err)

	>>> import urllib.request
	>>>
	>>> res=urllib.request.urlopen("https://www.python.org")
	>>> print(type(res))
	<class 'http.client.HTTPResponse'>
	>>>
	>>> res.status                 # HTTP status code of the response
	>>> res.getheaders()           # all response headers
	>>> res.getheader("server")    # value of the "server" response header

	from urllib import request,parse

	url = 'http://www.baidu.com'
	# Custom request headers; the User-Agent makes the request look like a browser.
	# NOTE(review): the Host header names a different site than `url` -- confirm
	# that the mismatch is intended before reusing this snippet.
	headers = {
	    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
	    'Host': 'mkdirs.org'
	}
	# Form fields for the POST body (named so the built-in `dict` is not shadowed).
	form_fields = {
	    'name': 'LyShark'
	}
	data = bytes(parse.urlencode(form_fields), encoding='utf8')
	req = request.Request(url=url, data=data, headers=headers, method='POST')
	# The context manager closes the connection once the body has been read.
	with request.urlopen(req) as response:
	    print(response.read().decode('utf-8'))

	import re

	def Get_Url(target,start,ends):
	    """Return the URLs ``target/<n>`` for every n in ``range(start, ends)``.

	    ``ends`` is exclusive; an empty range gives an empty list.
	    """
	    urls=[]
	    for i in range(start,ends):
	        url = target+"/"+str(i)
	        urls.append(url)
	    return urls

	if __name__ == "__main__":
	    # range() excludes the end value, so this builds the URLs for pages 1 to 9.
	    url = Get_Url("https://www.mzitu.com/214261",1,10)
	    print(url)

	import re
	import requests

	head={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

	if __name__ == "__main__":
	    # Download the page, then collect the src attribute of every <img> tag.
	    ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=1)
	    all_pic_link = re.findall('<img src="(.*?)"', ret.text, re.S)
	    print(all_pic_link)

	import re
	import urllib.request

	def open_url(url):
	    """Download ``url`` with a browser-like User-Agent and return the page text.

	    The body is decoded as UTF-8 and the connection is closed before returning.
	    """
	    req = urllib.request.Request(url)
	    req.add_header('user-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
	    # The context manager releases the socket even if read()/decode() raises.
	    with urllib.request.urlopen(req) as page:
	        return page.read().decode("utf-8")

	def get_img(html):
	    """Download every ``.jpg`` referenced by an ``<img src="...">`` tag in ``html``.

	    Each image is saved in the current directory under the last path
	    component of its URL.  Returns ``None``.
	    """
	    # Raw string: "\." inside a plain literal is an invalid escape sequence.
	    for each in re.findall(r'<img src="([^"]+\.jpg)"', html):
	        filename = each.split("/")[-1]
	        print("完整路径:",each)
	        print("文件名称:",filename)
	        urllib.request.urlretrieve(each,filename,None)

	if __name__ == '__main__':
	    # Despite its name, `url` holds the HTML text returned by open_url().
	    url = open_url("https://www.mzitu.com/210402")
	    get_img(url)

	import re
	import requests
	from bs4 import BeautifulSoup

	head={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

	def Get_CVE(url):
	    """Fetch ``url`` and return the ``href`` of every ``<a>`` tag, in page order."""
	    new_cve = []
	    ret = requests.get(url=url, headers=head, timeout=3)
	    bs = BeautifulSoup(ret.text, 'html.parser')
	    for i in bs.find_all('a'):
	        href = i.get('href')
	        new_cve.append(href)
	    return(new_cve)

	def Get_Number(list):
	    """Turn link targets into ``CVE-<year>-<number>`` identifiers.

	    ``None`` entries are tolerated; entries without an identifier are skipped.
	    """
	    new = []
	    for i in list:
	        # Use the matched text itself: formatting the findall() list produced
	        # strings such as "CVE-['2019-1234']" and "CVE-[]".
	        found = re.search(r"[0-9]{4}-[0-9]{4,}", str(i))
	        if found:
	            new.append("CVE-{}".format(found.group(0)))
	    return new

	if __name__ == "__main__":
	    # Page that lists the CVE entries changed today.
	    url= "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
	    cve = Get_CVE(url)
	    number = Get_Number(cve)
	    for i in number:
	        print("今日份的漏洞:",i)

	import re
	import requests

	head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
	ret = requests.get(url="https://www.xicidaili.com/nn/1", headers=head, timeout=3)
	# Text of every plain <td>...</td> cell on the page, in document order.
	data = [cell.replace("<td>","").replace("</td>","") for cell in re.findall('<td>.*</td>', ret.text)]
	# The cells are consumed five at a time (one proxy per group).  Show at most
	# 20 groups and stop early on a shorter page instead of raising IndexError.
	CELLS_PER_ROW = 5
	last_start = min(len(data), 20 * CELLS_PER_ROW) - CELLS_PER_ROW
	for start in range(0, last_start + 1, CELLS_PER_ROW):
	    IP, Port, Type, times, year = data[start:start + CELLS_PER_ROW]
	    print("IP地址:{} 端口号:{} 类型:{} 生存周期:{} 时间:{}".format(IP,Port,Type,times,year))

	from bs4 import BeautifulSoup
	import requests
	import html5lib

	head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
	ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)
	text = str(ret.content.decode('utf-8'))

	bs = BeautifulSoup(text,"html5lib")
	# From here on `ret` is the list of matching <p class="house-name"> tags.
	ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')
	for i in ret:
	    # stripped_strings yields each text fragment with surrounding whitespace
	    # removed; i.get_text() would return the text with its formatting instead.
	    house = list(i.stripped_strings)
	    print(house)

	import re
	from time import sleep
	import requests

	head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
	proxy = { "http":"http://127.0.0.1:9999" }
	# Without credentials: "http": "http://ip:port"
	# With credentials:    "https": "https://username:password@ip:port"

	with open("save.log","r",encoding="utf-8") as file:
	    for i in file:
	        data = i.strip()                 # drop the trailing newline
	        if not data:
	            continue                     # ignore blank lines
	        # requests chooses the proxy by the scheme of the requested URL, and
	        # the test URL is https, so register the address for both schemes.
	        proxy.update(http=data, https=data)
	        try:
	            ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
	        except requests.RequestException:
	            # An unreachable proxy raises instead of returning a status code.
	            ret = None
	        if ret is not None and ret.status_code == 200:
	            print("代理:{} 访问完成".format(proxy["http"]))
	        else:
	            print("代理:{} 不在线,失败".format(proxy["http"]))
	        sleep(1)

	import requests

	head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
	# requests chooses the proxy by the scheme of the requested URL; `url` is
	# https, so the proxy has to be listed under "https" as well.
	proxy = {
	    "http":"http://117.69.200.46:9999",
	    "https":"http://117.69.200.46:9999",
	}

	url = "https://nmap.org/dist/nmap-7.80-win32.zip"

	# stream=True downloads the body chunk by chunk instead of holding it all in
	# memory; both the response and the file are closed when the block exits.
	with requests.get(url=url, headers=head,stream=True,proxies=proxy) as ret, open("nmap.zip","wb") as fp:
	    for chunk in ret.iter_content(chunk_size=4096):
	        if chunk:
	            print("本次保存长度:{} ".format(len(chunk)))
	            fp.write(chunk)

	import requests
	import json

	def GetSubDomain(domain):
	    """Query the Baidu related-sites endpoint for ``domain`` and save the result.

	    Prints the number of entries and every entry, and writes the ``domain``
	    value of each entry to ``domain.log``, one per line.
	    """
	    url = "http://ce.baidu.com/index/getRelatedSites?site_address={}".format(domain)
	    ret = requests.get(url=url, timeout=5)
	    obj = json.loads(ret.text)
	    # A reply without "data" is treated as "no entries" instead of crashing.
	    sites = obj.get("data") or []
	    print("子域名个数：{}".format(len(sites)))

	    # The context manager closes the log even if an entry is malformed.
	    with open("domain.log","w") as fp:
	        for item in sites:
	            fp.write(item.get("domain"))
	            fp.write("\n")
	            print(item)

	GetSubDomain("qq.com")

	from bs4 import BeautifulSoup
	import requests,os

	header = {"User-Agent":"Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}

	def get_url(name,start_page,end_page):
	    """Collect post titles and links from a cnblogs user's list pages.

	    Returns ``(title, value)``: two parallel lists with the post titles and
	    the post URLs of pages ``start_page`` to ``end_page`` (both inclusive).
	    """
	    title = []
	    value = []
	    for page in range(start_page,end_page+1):
	        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name,page)
	        response = requests.get(url,headers=header,timeout=5)
	        text = response.content.decode("utf-8")
	        bs = BeautifulSoup(text,"lxml")
	        ret = bs.select('div[class="day"] div[class="postTitle"] a')
	        # At most ten posts per page; a shorter page no longer raises IndexError.
	        for link in ret[:10]:
	            x = link.get_text().replace("\n","")
	            y = link.get('href').replace("\n","")
	            title.append(x)
	            value.append(y)
	            print("[+] 文章路径: ---> 地址: {} ---> 标题: {}".format(y,x))
	    return title,value

	def down_page(page_name,url):
	    """Save one post and its images into a directory named after the post.

	    ``page_name`` is used as directory and file name after characters that
	    are invalid in file names are replaced by "_".
	    """
	    params = { "enc": "utf-8" }
	    response = requests.get(url=url,params=params,headers=header)
	    # Decode the raw bytes with the encoding detected from the content.  The
	    # original round-tripped through response.encoding, which may be None.
	    content = response.content.decode(response.apparent_encoding or "utf-8", errors="replace")

	    # The title comes from the web page: strip path/shell-significant
	    # characters and create the directory without going through a shell.
	    safe_name = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in page_name)
	    os.makedirs(safe_name, exist_ok=True)
	    # Store the page itself in that directory.
	    with open(os.path.join(safe_name, safe_name + ".html"), 'w', encoding='utf-8') as f:
	        f.write(content)

	    # Download the images referenced in the post body.
	    bs = BeautifulSoup(content, "lxml")
	    ret = bs.select('div[id="cnblogs_post_body"] div[class="left-9-code"] img')
	    for img_tag in ret:
	        src = img_tag.get("src")
	        if not src:
	            continue    # <img> without a src attribute
	        src_name = src.split("/")[-1]
	        print("[+] ---> 正在准备下载图片: {} ---> 地址: {}".format(src_name,src))
	        img = requests.get(url=src, stream=True)
	        with open(os.path.join(safe_name, src_name),'wb') as f:
	            for chunk in img.iter_content(chunk_size=1024):
	                f.write(chunk)

	if __name__ == '__main__':
	    # List pages 1 and 2 of the "lyshark" blog.
	    title,value = get_url("lyshark",1,2)
	    for item in range(0,len(value)):
	        print(title[item])
	        # The title with spaces removed becomes the directory name.
	        down_page(title[item].replace(" ",""),value[item])

	<!DOCTYPE html>
	<!-- Test page for the Selenium examples: three links, a login form and two paragraphs. -->
	<html lang="en">
	<head>
	<meta charset="gbk">
	<title>Selenium Test</title>
	</head>
	<body>
	<div class="acount" id="aid">
	<a class="mnav" href="https://news.baidu.com" name="trnews">新闻</a>
	<a class="mnav" href="https://lyshark.cnblogs.com" name="myblog">我的博客</a>
	<a class="mnav" href="https://github.com/lyshark" name="mygit">GitHub</a>
	</div>
	<form id="forms" class="fms" name="submit_form" action="index.html">
	<span class="soutu-btn"></span>
	<!-- Both inputs share name="wd" and class="s_ipt"; only their ids differ. -->
	<p>用户: <input id="user" class="s_ipt" name="wd" value="" maxlength="255" autocomplete="off"></p>
	<p>密码: <input id="pass" class="s_ipt" name="wd" value="" maxlength="255" autocomplete="off"></p>
	<input type="submit" value="提交" />
	</form>
	<p name="p1" > hello lyshark p1</p>
	<p name="p2" > hello lyshark p2</p>
	</body>
	</html>

	# Driver download page: http://chromedriver.storage.googleapis.com/index.html
	# NOTE(review): executable_path= and the find_element_by_* helpers belong to
	# the older Selenium API -- confirm they exist in the installed version.
	from selenium import webdriver

	WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
	driver = webdriver.Chrome(executable_path=WebPath)
	driver.set_window_size(1024,768)

	# Commonly used page properties.
	driver.get("http://lyshark.com")
	print("当前URL: {}".format(driver.current_url))
	print("当前标题: {}".format(driver.title))
	print("网页代码: {}".format(driver.page_source))

	# Basic find_element lookups
	print(driver.find_element_by_id("user")) # locate by id
	print(driver.find_element_by_name("p1").text) # locate by the name attribute
	print(driver.find_element_by_class_name("s_ipt")) # locate by class name

	# XPath lookups (a few common forms), followed by one CSS-selector lookup
	print(driver.find_element_by_xpath("//form[@class='fms']//input[@id='user']"))
	print(driver.find_element_by_xpath("//p[@name='p1']"))
	print(driver.find_element_by_xpath("//html/body/form/p/input"))
	print(driver.find_elements_by_css_selector(".fms #user"))

	# Locate <a> tags by their link text (full text, then partial text).
	print(driver.find_element_by_link_text("新闻"))
	print(driver.find_element_by_partial_link_text("我"))

	# -- coding:utf-8 --
	from selenium import webdriver
	import time

	WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
	driver = webdriver.Chrome(executable_path=WebPath)
	driver.set_window_size(1024,768)
	driver.get("https://www.baidu.com")

	driver.find_element_by_id("kw").send_keys("lyshark") # type the keyword into the input with id=kw
	driver.find_element_by_id("su").click() # click the search button (id=su)
	time.sleep(1)
	# XPath: inside the div with id="1", the <a> whose text contains "-"
	driver.find_element_by_xpath("//div[@id='1']//a[contains(text(),'-')]").click()
	time.sleep(1)

	handle = driver.current_window_handle # handle of the current window (not used below)
	handle_all = driver.window_handles # handles of all open windows
	print(handle_all)
	driver.switch_to.window(handle_all[0]) # switch to the first window
	time.sleep(1)
	driver.find_element_by_id("kw").clear() # then clear the search box

	from selenium import webdriver
	from selenium.webdriver import ActionChains
	from selenium.webdriver.common.keys import Keys

	WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
	driver = webdriver.Chrome(executable_path=WebPath)
	driver.set_window_size(1024,768)

	driver.get("https://www.baidu.com")

	# ------------------------------------------------------------------------
	# ActionChains provides the common mouse operations:
	# perform(): run the actions queued on the chain
	# context_click(): right click
	# double_click(): double click
	# drag_and_drop(): drag
	# move_to_element(): hover over an element

	# Locate the element to hover over.
	above = driver.find_element_by_link_text("更多产品")
	# Queue the hover on that element and execute it.
	ActionChains(driver).move_to_element(above).perform()

	# ------------------------------------------------------------------------
	# webdriver.common.keys provides the keyboard keys; commonly used ones:
	# send_keys(Keys.BACK_SPACE) Backspace
	# send_keys(Keys.SPACE) Space
	# send_keys(Keys.TAB) Tab
	# send_keys(Keys.ESCAPE) Esc
	# send_keys(Keys.ENTER) Enter
	# send_keys(Keys.CONTROL,'a') select all (Ctrl+A)
	# send_keys(Keys.CONTROL,'c') copy (Ctrl+C)
	# send_keys(Keys.CONTROL,'x') cut (Ctrl+X)
	# send_keys(Keys.CONTROL,'v') paste (Ctrl+V)
	# send_keys(Keys.F1) F1

	# Type into the search box.
	driver.find_element_by_id("kw").send_keys("seleniumm")
	# Delete the extra trailing "m".
	driver.find_element_by_id("kw").send_keys(Keys.BACK_SPACE)
	# Type a space followed by more text.
	driver.find_element_by_id("kw").send_keys(Keys.SPACE)
	driver.find_element_by_id("kw").send_keys("从入门到入土")

	# Ctrl+A: select the whole input
	driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'a')
	# Ctrl+X: cut the selection
	driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'x')
	# Ctrl+V: paste it back into the input
	driver.find_element_by_id("kw").send_keys(Keys.CONTROL, 'v')

	# Press Enter on the search button instead of clicking it.
	driver.find_element_by_id("su").send_keys(Keys.ENTER)

	from selenium import webdriver
	from bs4 import BeautifulSoup
	import requests,os,time,lxml
	import win32api,win32con

	header = {"User-Agent":"Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3"}

	def get_url(name,start_page,end_page):
	    """Collect the post links from a cnblogs user's list pages.

	    Returns the post URLs of pages ``start_page`` to ``end_page`` (both
	    inclusive), in page order.
	    """
	    value = []
	    for page in range(start_page,end_page+1):
	        url = "https://www.cnblogs.com/{}/default.html?page={}".format(name,page)
	        response = requests.get(url,headers=header,timeout=5)
	        text = response.content.decode("utf-8")
	        bs = BeautifulSoup(text,"lxml")
	        ret = bs.select('div[class="day"] div[class="postTitle"] a')
	        # At most ten posts per page; a shorter page no longer raises IndexError.
	        for link in ret[:10]:
	            y = link.get('href').replace("\n","")
	            value.append(y)
	            print("[+] 爬行地址: {} ".format(y))
	    return value

	if __name__ == "__main__":
	    # NOTE(review): the account name "csnd" looks like a placeholder -- confirm.
	    value = get_url("csnd",1,2)
	    WebPath = "C:/Users/LyShark/AppData/Local/Google/Chrome/Application/chromedriver.exe"
	    driver = webdriver.Chrome(executable_path=WebPath)
	    driver.set_window_size(1024,768)

	    for item in range(0,len(value)):
	        print("[-] ---> 开始保存:{}".format(value[item]))
	        driver.get(value[item])
	        # Press Ctrl+S: key down for 0x11 and 0x53, then key up in reverse order.
	        win32api.keybd_event(0x11, 0, 0, 0)
	        win32api.keybd_event(0x53, 0, 0, 0)
	        win32api.keybd_event(0x53, 0, win32con.KEYEVENTF_KEYUP, 0)
	        win32api.keybd_event(0x11, 0, win32con.KEYEVENTF_KEYUP, 0)
	        # Wait one second, then press Enter (0x0D).
	        time.sleep(1)
	        win32api.keybd_event(0x0D, 0, 0, 0)
	        win32api.keybd_event(0x0D, 0, win32con.KEYEVENTF_KEYUP, 0)

	from html.parser import HTMLParser


	# MyParser extends HTMLParser and collects the text of every
	# <h3 class="tb-main-title"> element.
	class MyParser(HTMLParser):
	    """Collect the stripped text that follows each ``<h3 class="tb-main-title">``.

	    Attributes:
	        re: list of collected strings, one per matching tag.
	        flg: 1 while the next text node belongs to a matching tag, else 0.
	    """

	    def __init__(self, *args, **kwargs):
	        super().__init__(*args, **kwargs)
	        # Per-instance state: a class-level list would be shared by every
	        # parser instance.
	        self.re = []
	        self.flg = 0

	    def handle_starttag(self, tag, attrs):
	        """Raise the flag when the target tag with the target class opens."""
	        if tag != 'h3':
	            return
	        for attr in attrs:
	            if attr[0] == 'class' and attr[1] == 'tb-main-title':
	                self.flg = 1
	                break

	    def handle_data(self, data):
	        """Store the first text node after a matching start tag, then reset."""
	        if self.flg == 1:
	            self.re.append(data.strip())
	            self.flg = 0


	# NOTE: `html` is not defined in this snippet; it must hold the page source
	# before feed() is called.
	my=MyParser()
	my.feed(html)