【基础】【Python网络爬虫】【7.requests高级】cookies会话维持、异常处理(附大量案例代码)(建议收藏)
2023-12-31 16:48:48
Python网络爬虫基础
requests 高级
1. 会话维持
cookies字段形式
import requests
url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
headers = {
# 在请求头中传递cookies
'Acs-Token': '1687354513445_1687354530324_U0QUCd0KA/F7BYp74tXcgnoFsNtOOxo4iufv+Hk5xXqn2+frnr0XUBVQuvTA3dfcUNYPfwpE/Y/JKtFNRsVrPchR4jO1sLxlmyw0hvh3usx51exIBKNRgH4NQXBDqAt3YJadXkNDjTR67nCTZiw+RJk7dF5HUYF5tJQ2b6P7MOd74rkMTn+xiwSraonXITV1rfLX6Pljrf7BCAACg8KuPEJplI1HlqnRHpoq54OKlcGiWXm2ZWfAcq4EVmqb1nVSge61u6U85j/n7R3JJ4LA96Vw0kcKtFi5X8GAw2SHCZ1fAZREBFeYdhG6fXVEZP+e6mkHJn/yUmb3IUb+GxtEhS1alaQMFv9QQZSBx6tXbfW6ncLHgcZfcDTqoKWSe3tdX39s1qnOEoWGvwLFFe/XMszJzUdMuOhndbQPdjkofy58aIlMTJErOTeELJ+21UOigR2VuwxiD/k9oI7vmMH0UUYzjVqojZcGNU2GrWMcfto=',
# 'Cookie': "BIDUPSID=A8D9EA340531252B16551CBD43A8D395; PSTM=1681976911; BAIDUID=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663; H_PS_PSSID=38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; delPer=0; PSINO=6; BA_HECTOR=ah2g2ka48k0g8h2h00a4842h1i95r4t1p; ZFY=CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C; BCLID=8504214758825411246; BCLID_BFESS=8504214758825411246; BDSFRCVID=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; 
# Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; ab_sr=1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA==",
'Host': 'fanyi.baidu.com',
'Origin': 'https://fanyi.baidu.com',
'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
data = {
'from': 'zh',
'to': 'en',
'query': '你好',
'transtype': 'realtime',
'simple_means_flag': '3',
'sign': '232427.485594',
'token': '351b986af9e8a703056ff2f022cdf830',
'domain': 'common',
'ts': '1687354530307',
}
# 构建cookies字典
# cookies = {'Cookie': 'BIDUPSID=A8D9EA340531252B16551CBD43A8D395; PSTM=1681976911; BAIDUID=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663; H_PS_PSSID=38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; delPer=0; PSINO=6; BA_HECTOR=ah2g2ka48k0g8h2h00a4842h1i95r4t1p; ZFY=CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C; BCLID=8504214758825411246; BCLID_BFESS=8504214758825411246; BDSFRCVID=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; 
# Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; ab_sr=1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA=='}
# 将每个cookies片段构建键值对
cookies = {
'BAIDUID': 'A8D9EA340531252BDEF2C13A73AFA5E7:FG=1',
'BAIDUID_BFESS': 'A8D9EA340531252BDEF2C13A73AFA5E7:FG=1',
'ZFY': 'CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C',
'BIDUPSID': 'A8D9EA340531252B16551CBD43A8D395',
'PSTM': '1681976911',
'APPGUIDE_10_0_2': '1',
'REALTIME_TRANS_SWITCH': '1',
'FANYI_WORD_SWITCH': '1',
'HISTORY_SWITCH': '1',
'SOUND_SPD_SWITCH': '1',
'SOUND_PREFER_SWITCH': '1',
'H_WISE_SIDS': '131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663',
'H_PS_PSSID': '38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350',
'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
'delPer': '0',
'PSINO': '6',
'BA_HECTOR': 'ah2g2ka48k0g8h2h00a4842h1i95r4t1p',
'BCLID': '8504214758825411246',
'BCLID_BFESS': '8504214758825411246',
'BDSFRCVID': 'TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
'BDSFRCVID_BFESS': 'TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
'H_BDCLCKID_SF': 'tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j',
'H_BDCLCKID_SF_BFESS': 'tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j',
'Hm_lvt_64ecd82404c51e03dc91cb9e8c025574': '1687354513',
'Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574': '1687354513',
'ab_sr': '1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA==',
}
# cookies关键字
response = requests.post(url=url, data=data, headers=headers, cookies=cookies)
json_data = response.json()
print(json_data)
# 请求会校验用户cookies字段
"""
cookies怎么来的?
服务器生成: 请求与响应间生成cookies的片段信息 Set-cookie
浏览器生成: 一般在我们模拟请求的时候服务器不会校验
js生成: JavaScript, 逆向分析
代码怎么维持用户的cookies状态?
"""
案例 - 某青网
cookies 保证同一个用户
"""
时间戳: 从格林威治时间1970年1月1日0时0分0秒开始, 到目前为止所经过的时间数
秒级时间戳: 10位数字
毫秒级时间戳: 13位数字
微秒级时间戳: 16位数字
"""
import time
import requests
def get_time():
    """Return the current Unix time in milliseconds as a string.

    The value is also printed to stdout so the user can see which
    timestamp was generated.

    Returns:
        str: ``int(time.time() * 1000)`` rendered as decimal digits
        (13 digits for present-day dates).
    """
    now_time = str(int(time.time() * 1000))
    print('当前时间戳为:', now_time)
    return now_time
# requests expects cookies as a name -> value mapping, so this must be a dict
# (a comma between the two strings would build a set instead).
# NOTE(review): the cookie name is spelled 'seesion' as in the session-based
# variant of this script; confirm the name the server actually uses.
cookies = {'seesion': 'vnrasebgvi'}
"""请求验证码, 保存"""
# Fetch the captcha image, tagged with a millisecond timestamp, and save it.
img_time = get_time()
img_url = 'http://118.126.88.143:5000/login/captcha?image_code=' + img_time
print('图片地址:', img_url)
img_response = requests.get(url=img_url).content
with open('yzm.png', mode='wb') as f:
    f.write(img_response)

# The captcha is read by a human from yzm.png and typed in.
img_code = input('请输入验证码:')
print('您输入的验证码为:', img_code)

# Build the login request.
login_url = 'http://118.126.88.143:5000/api/private/v1/login'
json_data = {
    # Send the same id that was used when requesting the captcha image;
    # a freshly generated timestamp would no longer match that request.
    # NOTE(review): presumably the server pairs captcha and login through
    # this id - the session-based variant of this script also sends img_time.
    "image_code": img_time,
    "username": "admin",
    "password": "123456",
    "captcha_code": img_code,  # captcha typed in by the user
}
# Plain requests.get / requests.post calls share no cookies with each other.
login_response = requests.post(url=login_url, json=json_data)
print(login_response.json())
# On other sites, consecutive requests are usually tied together through
# the cookies field.
"""
一般情况下要使用requests模块维持用户状态
1. 需要在指定网站抓登录包, 用代码模拟登录 --> 难点
2. 使用session会话维持, 维持用户的登录状态抓取数据
"""
会话维持
"""
时间戳: 从格林威治时间1970年1月1日0时0分0秒开始, 到目前为止所经过的时间数
秒级时间戳: 10位数字
毫秒级时间戳: 13位数字
微秒级时间戳: 16位数字
"""
import time
import requests
def get_time():
    """Return the current Unix time in milliseconds as a string.

    The value is also printed to stdout so the user can see which
    timestamp was generated.

    Returns:
        str: ``int(time.time() * 1000)`` rendered as decimal digits
        (13 digits for present-day dates).
    """
    now_time = str(int(time.time() * 1000))
    print('当前时间戳为:', now_time)
    return now_time
cookies = {'seesion': 'vnrasebgvi'}

# A Session object keeps cookies set by the server and sends them again on
# every later request made through the same object.
session = requests.Session()

# Fetch the captcha image through the session and save it.
img_time = get_time()
img_url = 'http://118.126.88.143:5000/login/captcha?image_code=' + img_time
print('图片地址:', img_url)
# NOTE: cookies passed as a keyword argument apply to this one call only;
# they are not stored on the session.
img_response = session.get(url=img_url, cookies=cookies).content
with open('yzm.png', mode='wb') as f:
    f.write(img_response)

# The captcha is read by a human from yzm.png and typed in.
img_code = input('请输入验证码:')
print('您输入的验证码为:', img_code)

# Build the login request.
login_url = 'http://118.126.88.143:5000/api/private/v1/login'
json_data = {
    "image_code": img_time,  # same id that was sent with the captcha request
    "username": "admin",
    "password": "123456",
    "captcha_code": img_code,  # captcha typed in by the user
}
# Log in through the same session so the login state is kept for later calls.
login_response = session.post(url=login_url, json=json_data)
print(login_response.cookies.get_dict())
print(login_response.json())
# On other sites, consecutive requests are usually tied together through
# the cookies field.
案例 - 太平洋亲子网
无会话维持
"""
官网地址: https://www.pcbaby.com.cn/
个人中心页面地址: https://my.pcbaby.com.cn/user/adminIndex.jsp
登录页面地址: http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp
账号: mb51222353
密码: 123456..
"""
import requests
# Request the personal-centre page without logging in first.
my_home_url = 'https://my.pcbaby.com.cn/user/adminIndex.jsp'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
response = requests.get(url=my_home_url, headers=headers)
print(response.text)
print(response.status_code)
# gb18030 is a superset of gb2312: text that fits gb2312 is written with the
# same bytes, and characters outside gb2312 no longer raise UnicodeEncodeError.
with open('my_home.html', mode='w', encoding='gb18030') as f:
    f.write(response.text)
"""
当我们没有登录的时候, 请求个人中心页面的数据会自动的重定向
"""
有会话维持
"""
官网地址: https://www.pcbaby.com.cn/
个人中心页面地址: https://my.pcbaby.com.cn/user/adminIndex.jsp
登录页面地址: http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp
账号: mb51222353
密码: 123456..
"""
import requests
# def get_proxy():
# url = 'http://zltiqu.pyhttp.taolop.com/getip?count=1&neek=13873&type=2&yys=0&port=1&sb=&mr=2&sep=0'
# response = requests.get(url=url)
# json_data = response.json()
# # print(json_data)
#
# ip_port = json_data['data'][0]['ip'] + ":" + str(json_data['data'][0]['port'])
# # print(ip_port)
#
# proxies = {
# "http": "http://" + ip_port,
# "https": "http://" + ip_port,
# }
# return proxies
#
# proxies = get_proxy()
# print('获取到的代理:', proxies)
# A Session keeps the cookies returned by the login request and sends them
# with the later request for the personal-centre page.
session = requests.Session()
headers = {
    # 'Cookie': 'u=35glx62u; c=36glwvep; pcsuv=1681996661754.a.77080874; slidecaptcha=bec8e1757a0944a7b880d4dea214a8d8; cmu=mb51222353; pcsuv2=1688389738448.a.99790086; pcuvdata=lastAccessTime=1688389738448|visits=3; channel=3794',
    'Host': 'passport3.pcbaby.com.cn',
    'Origin': 'http://my.pcbaby.com.cn',
    'Referer': 'http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}

# Simulate the login form submission.
# NOTE(review): the account name and password are hard-coded here; move them
# to environment variables or a config file before reusing this script.
login_url = 'http://passport3.pcbaby.com.cn/passport3/passport/login_ajax_do_new.jsp?req_enc=UTF-8'
data = {
    'return': 'https://my.pcbaby.com.cn/user/adminIndex.jsp',
    'bindUrl': 'https://my.pcbaby.com.cn/passport/bindMobile.jsp?return=https://my.pcbaby.com.cn/user/adminIndex.jsp',
    'username': 'mb51222353',
    'password': '123456..',
    'auto_login': '30',
    'checkbox': 'on',
}
login_response = session.post(url=login_url, data=data, headers=headers)
print(login_response.json())
print(login_response.status_code)

# With plain requests.post / requests.get the login call above and the page
# request below would be unrelated; going through the session links them.

# Request the personal-centre page with the same session.
my_home_url = 'https://my.pcbaby.com.cn/user/adminIndex.jsp'
response = session.get(url=my_home_url)
print(response.text)
print(response.status_code)
# gb18030 is a superset of gb2312: text that fits gb2312 is written with the
# same bytes, and characters outside gb2312 no longer raise UnicodeEncodeError.
with open('my_home_2.html', mode='w', encoding='gb18030') as f:
    f.write(response.text)
"""
当我们没有登录的时候, 请求个人中心页面的数据会自动的重定向
"""
2. 异常处理
乱码错误
import requests
# Fetch the page; response.text is decoded using response.encoding.
response = requests.get('http://www.pcbaby.com.cn/')
# Alternative: let requests detect the encoding from the response body:
# response.encoding = response.apparent_encoding
response.encoding = 'gb2312' # set the decoding charset by hand
# Other charsets worth trying when the text comes out garbled:
# gbk utf-8
# gb2312 utf-8-sig
# gb18030
html_str = response.text
print(html_str)
请求头参数错误
import requests
# Demonstrates a malformed request header value.
# NOTE(review): the User-Agent below starts with a space and has no space
# before "(KHTML"; this looks intentional, since this section is about
# header-parameter errors - confirm before "fixing" the string.
headers = {
'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/84.0.4128.3 Safari/537.36'
}
response = requests.get('http://www.shuquge.com/txt/8659/index.html',
headers=headers)
# Decode the body with the encoding detected from its content.
response.encoding = response.apparent_encoding
html = response.text
print(html)
"""
请求头字段中, 空格不能多不能少
"""
请求不到数据
import requests
url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
headers = {
# 在请求头中传递cookies
'Acs-Token': '1687354513445_1687354530324_U0QUCd0KA/F7BYp74tXcgnoFsNtOOxo4iufv+Hk5xXqn2+frnr0XUBVQuvTA3dfcUNYPfwpE/Y/JKtFNRsVrPchR4jO1sLxlmyw0hvh3usx51exIBKNRgH4NQXBDqAt3YJadXkNDjTR67nCTZiw+RJk7dF5HUYF5tJQ2b6P7MOd74rkMTn+xiwSraonXITV1rfLX6Pljrf7BCAACg8KuPEJplI1HlqnRHpoq54OKlcGiWXm2ZWfAcq4EVmqb1nVSge61u6U85j/n7R3JJ4LA96Vw0kcKtFi5X8GAw2SHCZ1fAZREBFeYdhG6fXVEZP+e6mkHJn/yUmb3IUb+GxtEhS1alaQMFv9QQZSBx6tXbfW6ncLHgcZfcDTqoKWSe3tdX39s1qnOEoWGvwLFFe/XMszJzUdMuOhndbQPdjkofy58aIlMTJErOTeELJ+21UOigR2VuwxiD/k9oI7vmMH0UUYzjVqojZcGNU2GrWMcfto=',
'Cookie': "BIDUPSID=A8D9EA340531252B16551CBD43A8D395; PSTM=1681976911; BAIDUID=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663; H_PS_PSSID=38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; delPer=0; PSINO=6; BA_HECTOR=ah2g2ka48k0g8h2h00a4842h1i95r4t1p; ZFY=CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C; BCLID=8504214758825411246; BCLID_BFESS=8504214758825411246; BDSFRCVID=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; 
Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; ab_sr=1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA==",
'Host': 'fanyi.baidu.com',
'Origin': 'https://fanyi.baidu.com',
'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
data = {
'from': 'zh',
'to': 'en',
'query': '你好',
'transtype': 'realtime',
'simple_means_flag': '3',
'sign': '232427.485594',
'token': '351b986af9e8a703056ff2f022cdf830',
'domain': 'common',
'ts': '1687354530307',
}
response = requests.post(url=url, data=data, headers=headers)
json_data = response.json()
print(json_data)
"""
当我们请求不到数据的时候, 可以从以下几个方面排查
是不是请求头被反爬了?
params 参数有没有加密
data参数 有没有加密
是不是服务器问题
"""
目标计算机积极拒绝
import requests
# Query a service on localhost port 5010 (presumably a local proxy pool -
# verify). If nothing is listening there, requests.get raises
# requests.exceptions.ConnectionError.
proxy_response = requests.get('http://127.0.0.1:5010/get')
proxy = proxy_response.json()
print(proxy)
"""
如果目标计算机积极拒绝 --> requests.exceptions.ConnectionError:
此问题是你在服务器没有权限
或者服务器没有服务程序
"""
连接超时
import requests
# Same kind of request with a 0.0001 s timeout; a limit this small is
# presumably chosen so that the call times out, to show the timeout error.
proxy_response = requests.get('http://134.175.188.27:5010/get', timeout=0.0001)
proxy = proxy_response.json()
print(proxy)
异常重试
import requests
# Try the request, and retry exactly once if it fails.
# Only request failures and JSON decoding errors trigger the retry, so that
# KeyboardInterrupt / SystemExit are never swallowed. A failure on the second
# attempt is re-raised to the caller.
# NOTE: the 0.0001 s timeout is kept from the demo; use a realistic value
# (a few seconds) for real use.
for attempt in range(2):
    try:
        proxy_response = requests.get('http://134.175.188.27:5010/get', timeout=0.0001)
        proxy = proxy_response.json()
    except (requests.exceptions.RequestException, ValueError):
        if attempt == 1:
            raise
        continue
    print(proxy)
    break
案例 - 异常重试
import parsel
import requests
def get_one_chapter(url, times=3):
    """Download one chapter page and save its text under the ``小说`` directory.

    The page is fetched with the module-level ``headers``; the output file
    name is built from the module-level ``count`` plus the chapter title.

    Args:
        url: Absolute URL of the chapter page.
        times: Number of retries allowed after a failed attempt, so at most
            ``times + 1`` attempts are made.

    Returns:
        None. Each failure is printed; once the retries are used up the
        function gives up without raising, so a caller's loop keeps going.
    """
    import os

    attempts_left = times
    while True:
        try:
            response = requests.get(url=url, headers=headers, timeout=0.1)
            selector = parsel.Selector(response.text)
            # Title: the part of the <h1> text that follows the first space.
            title = selector.css('h1.wap_none::text').re(' (.*)')[0]
            paragraphs = selector.css('#chaptercontent::text').getall()
            # Strip the two full-width spaces used as paragraph indentation.
            content = '\n'.join(p.replace('\u3000\u3000', '') for p in paragraphs)
            # Create the output directory on demand; without it every attempt
            # would fail with FileNotFoundError.
            os.makedirs('小说', exist_ok=True)
            file_path = os.path.join('小说', str(count) + title + '.txt')
            with open(file_path, mode='w', encoding='utf-8') as f:
                f.write(content)
            print('保存完成:', file_path)
            break
        except Exception as e:
            # Deliberate best-effort boundary: any failure (network, parsing,
            # file system) is reported and the whole attempt is retried.
            print(e)
            if attempts_left < 1:
                break
            attempts_left -= 1
    print('*' * 100)
if __name__ == '__main__':
    # Module-level chapter counter; get_one_chapter reads it to build the
    # output file name.
    count = 1
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
    url = 'https://www.bqg70.com/book/1031/'
    response = requests.get(url=url, headers=headers)
    selector = parsel.Selector(response.text)
    # One <dd> per entry in the chapter list.
    dds = selector.css('.listmain>dl dd')
    for dd in dds:
        title = dd.css('a::text').get()
        link_url = dd.css('a::attr(href)').get()
        # .get() returns None when the <dd> has no link text or href; skip
        # those entries as well as the "expand all chapters" pseudo entry.
        if not title or not link_url or '展开全部章节' in title:
            continue
        all_url = 'https://www.bqg70.com' + link_url
        # print(all_url)
        get_one_chapter(all_url)
        count += 1
案例 - 爬取我的钢铁网
import csv
import requests
url = 'https://index.mysteel.com/api/pricetrend/getChartMultiCity.htm'
# requests percent-encodes the values given in ``params`` itself, so they must
# be passed in decoded form; already-encoded text such as '%E9%95%BF' would
# have its '%' escaped again and reach the server as '%25E9%2595%25BF'.
params = {
    'catalog': '角钢_:_角钢',
    'city': '长沙',
    'spec': 'Q235B 50*50*5_:_Q235B_50*50*5',
    'startTime': '2023-03-01',
    'endTime': '2023-04-01',
    'callback': 'json',
    'v': '1688557188999',
}
response = requests.get(url=url, params=params)
json_data = response.json()
print(json_data)
# Only the first series of the response is exported.
series = json_data['data'][0]
city_name = series['lineName']
print(city_name)
with open('data.csv', mode='a', encoding='utf-8', newline='') as f:
    csv_write = csv.DictWriter(f, fieldnames=['city_name', 'date', 'value'])
    # The file is opened in append mode: emit the header only when the file
    # is still empty, so repeated runs do not insert extra header rows.
    if f.tell() == 0:
        csv_write.writeheader()
    for i in series['dateValueMap']:
        i['city_name'] = city_name
        csv_write.writerow(i)  # one row per date/value entry
# Show what the percent-encoded forms seen in the browser decode to.
print(requests.utils.unquote('%E9%95%BF%E6%B2%99'))
print(requests.utils.unquote('%E8%A7%92%E9%92%A2_'))
print(requests.utils.unquote('%20'))
文章来源:https://blog.csdn.net/weixin_43612602/article/details/135316904
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!