【基础】【Python网络爬虫】【7.requests高级】cookies会话维持、异常处理(附大量案例代码)(建议收藏)

2023-12-31 16:48:48

requests 高级

1. 会话维持

cookies字段形式

import requests

url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
headers = {
    # 在请求头中传递cookies
    'Acs-Token': '1687354513445_1687354530324_U0QUCd0KA/F7BYp74tXcgnoFsNtOOxo4iufv+Hk5xXqn2+frnr0XUBVQuvTA3dfcUNYPfwpE/Y/JKtFNRsVrPchR4jO1sLxlmyw0hvh3usx51exIBKNRgH4NQXBDqAt3YJadXkNDjTR67nCTZiw+RJk7dF5HUYF5tJQ2b6P7MOd74rkMTn+xiwSraonXITV1rfLX6Pljrf7BCAACg8KuPEJplI1HlqnRHpoq54OKlcGiWXm2ZWfAcq4EVmqb1nVSge61u6U85j/n7R3JJ4LA96Vw0kcKtFi5X8GAw2SHCZ1fAZREBFeYdhG6fXVEZP+e6mkHJn/yUmb3IUb+GxtEhS1alaQMFv9QQZSBx6tXbfW6ncLHgcZfcDTqoKWSe3tdX39s1qnOEoWGvwLFFe/XMszJzUdMuOhndbQPdjkofy58aIlMTJErOTeELJ+21UOigR2VuwxiD/k9oI7vmMH0UUYzjVqojZcGNU2GrWMcfto=',
    # 'Cookie': "BIDUPSID=A8D9EA340531252B16551CBD43A8D395; PSTM=1681976911; BAIDUID=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663; H_PS_PSSID=38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; delPer=0; PSINO=6; BA_HECTOR=ah2g2ka48k0g8h2h00a4842h1i95r4t1p; ZFY=CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C; BCLID=8504214758825411246; BCLID_BFESS=8504214758825411246; BDSFRCVID=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; 
    # Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; ab_sr=1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA==",
    'Host': 'fanyi.baidu.com',
    'Origin': 'https://fanyi.baidu.com',
    'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

data = {
    'from': 'zh',
    'to': 'en',
    'query': '你好',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '232427.485594',
    'token': '351b986af9e8a703056ff2f022cdf830',
    'domain': 'common',
    'ts': '1687354530307',
}

# 构建cookies字典
# cookies = {'Cookie': 'BIDUPSID=A8D9EA340531252B16551CBD43A8D395; PSTM=1681976911; BAIDUID=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663; H_PS_PSSID=38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; delPer=0; PSINO=6; BA_HECTOR=ah2g2ka48k0g8h2h00a4842h1i95r4t1p; ZFY=CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C; BCLID=8504214758825411246; BCLID_BFESS=8504214758825411246; BDSFRCVID=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; 
# Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; ab_sr=1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA=='}


# 将每个cookies片段构建键值对
cookies = {
    'BAIDUID': 'A8D9EA340531252BDEF2C13A73AFA5E7:FG=1',
    'BAIDUID_BFESS': 'A8D9EA340531252BDEF2C13A73AFA5E7:FG=1',
    'ZFY': 'CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C',
    'BIDUPSID': 'A8D9EA340531252B16551CBD43A8D395',
    'PSTM': '1681976911',
    'APPGUIDE_10_0_2': '1',
    'REALTIME_TRANS_SWITCH': '1',
    'FANYI_WORD_SWITCH': '1',
    'HISTORY_SWITCH': '1',
    'SOUND_SPD_SWITCH': '1',
    'SOUND_PREFER_SWITCH': '1',
    'H_WISE_SIDS': '131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663',
    'H_PS_PSSID': '38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350',
    'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
    'delPer': '0',
    'PSINO': '6',
    'BA_HECTOR': 'ah2g2ka48k0g8h2h00a4842h1i95r4t1p',
    'BCLID': '8504214758825411246',
    'BCLID_BFESS': '8504214758825411246',
    'BDSFRCVID': 'TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
    'BDSFRCVID_BFESS': 'TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
    'H_BDCLCKID_SF': 'tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j',
    'H_BDCLCKID_SF_BFESS': 'tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j',
    'Hm_lvt_64ecd82404c51e03dc91cb9e8c025574': '1687354513',
    'Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574': '1687354513',
    'ab_sr': '1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA==',

}
# Submit the form; the cookies travel through the dedicated `cookies`
# keyword rather than through a hand-written Cookie header.
response = requests.post(url, data=data, headers=headers, cookies=cookies)
json_data = response.json()
print(json_data)

# 请求会校验用户cookies字段
"""
cookies怎么来的?
    服务器生成: 请求与响应间生成cookies的片段信息  Set-cookie
    浏览器生成: 一般在我们模拟请求的时候服务器不会校验
    js生成: JavaScript, 逆向分析
    
代码怎么维持用户的cookies状态?
"""
案例 - 某青网
cookies 保证同一个用户
"""
时间戳: 从格林威治时间1970年1月1日0时0分0秒起, 到目前为止所经过的时间数
    秒级时间戳: 10位数字
    毫秒级时间戳: 13位数字
    微秒级时间戳: 16位数字
"""
import time
import requests

def get_time():
    """Return the current Unix time in milliseconds as a string.

    The value is echoed to stdout before it is returned.
    """
    millis = int(time.time() * 1000)
    stamp = str(millis)
    print('当前时间戳为:', stamp)
    return stamp

# Cookies as a name -> value mapping, the form the `cookies=` keyword of
# requests expects.
# NOTE(review): 'seesion' looks like a misspelling of 'session' — confirm the
# cookie name the server actually uses before changing it.
cookies = {'seesion': 'vnrasebgvi'}

# --- Step 1: download the captcha image and store it on disk ---
img_time = get_time()
img_url = f'http://118.126.88.143:5000/login/captcha?image_code={img_time}'
print('图片地址:', img_url)

# Plain requests.get: nothing links this call to the login request below.
img_response = requests.get(url=img_url).content
with open('yzm.png', mode='wb') as captcha_file:
    captcha_file.write(img_response)

# The captcha is read by a human and typed in.
img_code = input('请输入验证码:')
print('您输入的验证码为:', img_code)

# --- Step 2: send the login form as a JSON body ---
login_url = 'http://118.126.88.143:5000/api/private/v1/login'
json_data = dict(
    image_code=get_time(),  # a fresh timestamp, not the one put in the captcha URL
    username="admin",
    password="123456",
    captcha_code=img_code,  # value typed in by the user
)

login_response = requests.post(url=login_url, json=json_data)
print(login_response.json())

# 其他网站构建请求联系, 一般是通过cookies字段
"""
一般情况下要使用requests模块维持用户状态
    1. 需要在指定网站抓登录包, 用代码模拟登录 --> 难点
    2. 使用session会话维持, 维持用户的登录状态抓取数据
"""
会话维持
"""
时间戳: 从格林威治时间1970年1月1日0时0分0秒起, 到目前为止所经过的时间数
    秒级时间戳: 10位数字
    毫秒级时间戳: 13位数字
    微秒级时间戳: 16位数字
"""
import time
import requests

def get_time():
    """Build a millisecond-precision Unix timestamp string, print it and return it."""
    now_time = f'{int(time.time() * 1000)}'
    print('当前时间戳为:', now_time)
    return now_time

cookies = {'seesion': 'vnrasebgvi'}

# One Session object is shared by both requests below, so cookies set by the
# captcha response are sent again with the login request.
session = requests.Session()

# --- Step 1: download the captcha image through the session ---
img_time = get_time()
img_url = f'http://118.126.88.143:5000/login/captcha?image_code={img_time}'
print('图片地址:', img_url)

img_response = session.get(url=img_url, cookies=cookies).content
with open('yzm.png', mode='wb') as captcha_file:
    captcha_file.write(img_response)

# The captcha is read by a human and typed in.
img_code = input('请输入验证码:')
print('您输入的验证码为:', img_code)

# --- Step 2: log in through the same session ---
login_url = 'http://118.126.88.143:5000/api/private/v1/login'
json_data = dict(
    image_code=img_time,  # the same timestamp that was put in the captcha URL
    username="admin",
    password="123456",
    captcha_code=img_code,  # value typed in by the user
)

login_response = session.post(url=login_url, json=json_data)
print(login_response.cookies.get_dict())
print(login_response.json())

# 其他网站构建请求联系, 一般是通过cookies字段
案例 - 太平洋亲子网
无会话维持
"""
官网地址: https://www.pcbaby.com.cn/
个人中心页面地址: https://my.pcbaby.com.cn/user/adminIndex.jsp
登录页面地址: http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp

账号: mb51222353
密码: 123456..
"""
import requests

# Request the personal-centre page with a bare requests.get: no login step
# and no cookies are involved.
my_home_url = 'https://my.pcbaby.com.cn/user/adminIndex.jsp'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
response = requests.get(my_home_url, headers=headers)
print(response.text, response.status_code, sep='\n')

# Keep a copy of whatever page came back for inspection.
with open('my_home.html', mode='w', encoding='gb2312') as page_file:
    page_file.write(response.text)

"""
当我们没有登录的时候, 请求个人中心页面的数据会自动的重定向
"""
有会话维持
"""
官网地址: https://www.pcbaby.com.cn/
个人中心页面地址: https://my.pcbaby.com.cn/user/adminIndex.jsp
登录页面地址: http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp

账号: mb51222353
密码: 123456..
"""
import requests

# def get_proxy():
#     url = 'http://zltiqu.pyhttp.taolop.com/getip?count=1&neek=13873&type=2&yys=0&port=1&sb=&mr=2&sep=0'
#     response = requests.get(url=url)
#     json_data = response.json()
#     # print(json_data)
#
#     ip_port = json_data['data'][0]['ip'] + ":" + str(json_data['data'][0]['port'])
#     # print(ip_port)
#
#     proxies = {
#         "http": "http://" + ip_port,
#         "https": "http://"  + ip_port,
#     }
#     return proxies
#
# proxies = get_proxy()
# print('获取到的代理:', proxies)


# A single Session carries the cookies from the login response over to the
# personal-centre request further down.
session = requests.Session()

# No Cookie header is written by hand here.
headers = {
    'Host': 'passport3.pcbaby.com.cn',
    'Origin': 'http://my.pcbaby.com.cn',
    'Referer': 'http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}

# --- Step 1: simulated login (form-encoded POST) ---
# NOTE(review): the account name and password are hard-coded in the payload.
login_url = 'http://passport3.pcbaby.com.cn/passport3/passport/login_ajax_do_new.jsp?req_enc=UTF-8'
data = {
    'return': 'https://my.pcbaby.com.cn/user/adminIndex.jsp',
    'bindUrl': 'https://my.pcbaby.com.cn/passport/bindMobile.jsp?return=https://my.pcbaby.com.cn/user/adminIndex.jsp',
    'username': 'mb51222353',
    'password': '123456..',
    'auto_login': '30',
    'checkbox': 'on',
}

login_response = session.post(login_url, data=data, headers=headers)
print(login_response.json(), login_response.status_code, sep='\n')

# --- Step 2: request the personal-centre page ---
# With plain requests.get/post the two calls would be unrelated; going through
# the same session is what links them.
my_home_url = 'https://my.pcbaby.com.cn/user/adminIndex.jsp'
response = session.get(my_home_url)
print(response.text, response.status_code, sep='\n')

with open('my_home_2.html', mode='w', encoding='gb2312') as page_file:
    page_file.write(response.text)

"""
当我们没有登录的时候, 请求个人中心页面的数据会自动的重定向
"""

2. 异常处理

乱码错误

import requests

response = requests.get('http://www.pcbaby.com.cn/')

# Decode the body with an explicitly chosen charset. The alternative is
# `response.encoding = response.apparent_encoding`, which lets requests guess
# the charset from the body.
# Charsets worth trying when text comes out garbled:
#   gbk, gb2312, gb18030, utf-8, utf-8-sig
response.encoding = 'gb2312'

html_str = response.text
print(html_str)

请求头参数错误

import requests

# NOTE(review): the User-Agent value starts with a space. Given the section
# this script sits in (request-header errors), that looks like the deliberate
# malformed header being demonstrated — confirm before "fixing" it.
headers = {
    'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/84.0.4128.3 Safari/537.36',
}
response = requests.get('http://www.shuquge.com/txt/8659/index.html', headers=headers)

# Let requests guess the charset from the body before decoding it.
response.encoding = response.apparent_encoding
html = response.text
print(html)

"""
请求头字段中, 空格不能多不能少
"""

请求不到数据

import requests
url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
# Request headers for the translate endpoint; the cookies are passed inside
# the headers as a single Cookie value.
# NOTE(review): the token and cookie values look like a capture from one
# browser session and are presumably short-lived — confirm before reuse.
headers = {
    'Acs-Token': '1687354513445_1687354530324_U0QUCd0KA/F7BYp74tXcgnoFsNtOOxo4iufv+Hk5xXqn2+frnr0XUBVQuvTA3dfcUNYPfwpE/Y/JKtFNRsVrPchR4jO1sLxlmyw0hvh3usx51exIBKNRgH4NQXBDqAt3YJadXkNDjTR67nCTZiw+RJk7dF5HUYF5tJQ2b6P7MOd74rkMTn+xiwSraonXITV1rfLX6Pljrf7BCAACg8KuPEJplI1HlqnRHpoq54OKlcGiWXm2ZWfAcq4EVmqb1nVSge61u6U85j/n7R3JJ4LA96Vw0kcKtFi5X8GAw2SHCZ1fAZREBFeYdhG6fXVEZP+e6mkHJn/yUmb3IUb+GxtEhS1alaQMFv9QQZSBx6tXbfW6ncLHgcZfcDTqoKWSe3tdX39s1qnOEoWGvwLFFe/XMszJzUdMuOhndbQPdjkofy58aIlMTJErOTeELJ+21UOigR2VuwxiD/k9oI7vmMH0UUYzjVqojZcGNU2GrWMcfto=',
    # Adjacent string literals are concatenated by Python, so the cookie stays
    # one unbroken header value (a line break inside a quoted string is a
    # syntax error). One `name=value; ` pair per literal.
    'Cookie': (
        "BIDUPSID=A8D9EA340531252B16551CBD43A8D395; "
        "PSTM=1681976911; "
        "BAIDUID=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; "
        "APPGUIDE_10_0_2=1; "
        "REALTIME_TRANS_SWITCH=1; "
        "FANYI_WORD_SWITCH=1; "
        "HISTORY_SWITCH=1; "
        "SOUND_SPD_SWITCH=1; "
        "SOUND_PREFER_SWITCH=1; "
        "H_WISE_SIDS=131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663; "
        "H_PS_PSSID=38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350; "
        "BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; "
        "BAIDUID_BFESS=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; "
        "delPer=0; "
        "PSINO=6; "
        "BA_HECTOR=ah2g2ka48k0g8h2h00a4842h1i95r4t1p; "
        "ZFY=CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C; "
        "BCLID=8504214758825411246; "
        "BCLID_BFESS=8504214758825411246; "
        "BDSFRCVID=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; "
        "BDSFRCVID_BFESS=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; "
        "H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; "
        "H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; "
        "Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; "
        "Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; "
        "ab_sr=1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA=="
    ),
    'Host': 'fanyi.baidu.com',
    'Origin': 'https://fanyi.baidu.com',
    'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

data = {
    'from': 'zh',
    'to': 'en',
    'query': '你好',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '232427.485594',
    'token': '351b986af9e8a703056ff2f022cdf830',
    'domain': 'common',
    'ts': '1687354530307',
}

# Submit the form; the cookies ride along inside `headers`.
response = requests.post(url, data=data, headers=headers)
json_data = response.json()
print(json_data)

"""
当我们请求不到数据的时候, 可通过以下方式考虑
    是不是请求头被反爬了?
    params 参数有没有加密
    data参数 有没有加密
    是不是服务器问题
"""

目标计算机积极拒绝

import requests

# Ask a service on the local machine (port 5010) for a proxy and show the
# decoded JSON reply.
proxy_response = requests.get(url='http://127.0.0.1:5010/get')
proxy = proxy_response.json()
print(proxy)

"""
如果目标计算机积极拒绝 --> requests.exceptions.ConnectionError:
    出现此问题, 通常是你在该服务器上没有访问权限,
    或者服务器上没有运行对应的服务程序
"""

连接超时

import requests

# The timeout is 0.0001 s, so the request is given almost no time to complete.
proxy_response = requests.get(url='http://134.175.188.27:5010/get', timeout=0.0001)
proxy = proxy_response.json()
print(proxy)

异常重试

import requests

# Fetch a proxy, trying at most MAX_ATTEMPTS times. Only request errors and
# JSON-decoding errors trigger another try; anything else (Ctrl-C included)
# propagates at once. The last failure is re-raised.
MAX_ATTEMPTS = 2
for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        proxy_response = requests.get('http://134.175.188.27:5010/get', timeout=0.0001)
        proxy = proxy_response.json()
    except (requests.exceptions.RequestException, ValueError):
        if attempt == MAX_ATTEMPTS:
            raise
    else:
        print(proxy)
        break
案例 - 异常重试
import parsel
import requests

def get_one_chapter(url, times=3):
    """Download one chapter page and save its text to a file, retrying on failure.

    Args:
        url: Address of the chapter page.
        times: Number of extra attempts allowed after the first one fails, so
            at most ``times + 1`` requests are made (always at least one).

    Returns:
        None. On success the text is written to ``小说/<count><title>.txt``.

    Reads the module-level names ``headers`` (request headers) and ``count``
    (number used as the file-name prefix).

    Download/parse failures and file-system errors are printed rather than
    raised: when the attempts are used up, or the file cannot be written, the
    chapter is skipped.
    """
    import os  # local import so the block brings its own dependency

    attempts = max(times, 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url=url, headers=headers, timeout=0.1)
            selector = parsel.Selector(response.text)
            # [0] raises IndexError when the heading is not found in the page;
            # that is treated like a failed download and retried.
            title = selector.css('h1.wap_none::text').re(' (.*)')[0]
            paragraphs = selector.css('#chaptercontent::text').getall()
        except (requests.exceptions.RequestException, IndexError) as e:
            print(e)
            if attempt < attempts:
                print('*' * 100)  # visual separator before the next attempt
            continue
        break
    else:
        # Every attempt failed: give up on this chapter.
        return

    # Drop the double full-width-space paragraph indent, one paragraph per line.
    contend = '\n'.join(line.replace('\u3000\u3000', '') for line in paragraphs)

    # Saving is kept outside the retry loop: a local file error cannot be
    # cured by downloading the page again.
    file_path = os.path.join('小说', str(count) + title + '.txt')
    try:
        os.makedirs('小说', exist_ok=True)
        with open(file_path, mode='w', encoding='utf-8') as f:
            f.write(contend)
    except OSError as e:
        print(e)
        return
    print('保存完成:', file_path)


if __name__ == '__main__':
    # Module-level state read by get_one_chapter: `count` is the number put
    # in front of each saved file name, `headers` the request headers.
    count = 1
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }

    # Fetch the book's index page and walk its chapter list.
    url = 'https://www.bqg70.com/book/1031/'
    response = requests.get(url=url, headers=headers)
    selector = parsel.Selector(response.text)

    for entry in selector.css('.listmain>dl dd'):
        title = entry.css('a::text').get()
        href = entry.css('a::attr(href)').get()
        all_url = 'https://www.bqg70.com' + href

        # Entries whose text contains "展开全部章节" are skipped and do not
        # consume a chapter number.
        if '展开全部章节' not in title:
            get_one_chapter(all_url)
            count += 1
案例 - 爬取我的钢铁网
import csv
import requests

url = 'https://index.mysteel.com/api/pricetrend/getChartMultiCity.htm'

# Query-string values are written in plain (decoded) form: requests
# percent-encodes everything passed through `params=` itself, so text that is
# already percent-encoded would be encoded a second time ('%' -> '%25').
params = {
    'catalog': '角钢_:_角钢',
    'city': '长沙',
    'spec': 'Q235B 50*50*5_:_Q235B_50*50*5',
    'startTime': '2023-03-01',
    'endTime': '2023-04-01',
    'callback': 'json',
    'v': '1688557188999',
}

# Query the endpoint and show the decoded JSON reply.
response = requests.get(url, params=params)
json_data = response.json()
print(json_data)

# Only the first series in the reply is exported.
first_series = json_data['data'][0]
city_name = first_series['lineName']
print(city_name)

# Append to data.csv; a header row is written on every run.
with open('data.csv', mode='a', encoding='utf-8', newline='') as csv_file:
    csv_write = csv.DictWriter(csv_file, fieldnames=['city_name', 'date', 'value'])
    csv_write.writeheader()

    for row in first_series['dateValueMap']:
        row['city_name'] = city_name  # tag each record with its series name
        csv_write.writerow(row)

# Show what a few percent-encoded fragments decode to.
for encoded in ('%E9%95%BF%E6%B2%99', '%E8%A7%92%E9%92%A2_', '%20'):
    print(requests.utils.unquote(encoded))

文章来源:https://blog.csdn.net/weixin_43612602/article/details/135316904
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。