获取网页信息
2024-01-03 15:30:54
每次copy & paste总是很麻烦,现在有点问题,先记录下来。
需求:获取url 里Feature list,并输出表格形式
可以用 Convert curl commands to code 这类工具,得到 GET 请求的 header、cookie 等:
import requests
import re
from json2html import json2html
from bs4 import BeautifulSoup
# Cookie jar sent with the request, stored as plain name -> value strings.
# NOTE(review): these look like values captured from a live browser session
# (JSESSIONID, consent cookies, analytics ids); they presumably expire and
# need to be refreshed before re-running -- TODO confirm.
cookies = {
    # Analytics / marketing identifiers.
    '_ga': 'GA1.2.1362872320.1699326902',
    '_fbp': 'fb.1.1703745569173.788449175',
    '_zm_visitor_guid': 'ab14067a105b55591ca36931e79a6fc0',
    '_zm_mtk_guid': 'b214987e283ec1df03f09df41170675b',
    '_ds_id': '8c2d2994-3b41-4b59-be95-2b8717ffe0e6',
    '__utmzz': 'source=(direct)|medium=(none)|campaign=(not set)',
    'AMP_MKTG_0753e77572': 'JTdCJTdE',
    '_gcl_au': '1.1.55355038.1703817513',
    'OnetrustActiveGroups': 'C0004C0003C0002C0001',
    'AMP_0753e77572': 'JTdCJTIyZGV2aWNlSWQlMjIlM0ElMjJkYWQyMGM3NS0xYzdkLTRmODYtYjI4Yi03MTNmZTNlY2E5ZjglMjIlMkMlMjJzZXNzaW9uSWQlMjIlM0ExNzAzODE3NTEyNDY3JTJDJTIyb3B0T3V0JTIyJTNBZmFsc2UlMkMlMjJsYXN0RXZlbnRUaW1lJTIyJTNBMTcwMzgxNzUxMzMxMiUyQyUyMmxhc3RFdmVudElkJTIyJTNBMyU3RA==',
    '_yjsu_yjad': '1703817513.e9d3aadf-244b-4756-90c8-d8152831b27e',
    '_uetvid': '5c32b050a5f311ee8e0337e664efcd94',
    'iv': '51a85645-5246-4995-9a5b-627ccafbae0b',
    '_cs_c': '0',
    '_cs_id': '0b459793-a9d5-a89c-c1e2-70499565b08c.1703817514.2.1703833540.1703833540.1.1737981514343',
    '_gid': 'GA1.2.1035150089.1704176623',
    # Load-balancer / session / portal routing cookies.
    'BIGipServerpool_zoomus': '2f6ba358017c66e5283571a5c5fc3b1a',
    'JSESSIONID': '2C44F6D93F6593E571F97C2BAE1AF4DB',
    'glide_user_route': 'glide.de6ecf26cf6f93e1b52b94d2be12e7df',
    'glide_language': 'zh',
    # Cookie-consent banner state.
    'OptanonAlertBoxClosed': '2024-01-03T04:07:00.123Z',
    'OptanonConsent': 'isGpcEnabled=0&datestamp=Wed+Jan+03+2024+12%3A07%3A00+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202310.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=e9dfd41b-73f2-470f-ab16-4e504558809b&interactionCount=32&landingPath=NotLandingPage&groups=C0004%3A0%2CC0003%3A0%2CC0002%3A0%2CC0001%3A1&geolocation=JP%3B13&AwaitingReconsent=false',
}
# HTTP request headers sent with the page API call.
# No 'Cookie' entry is set here on purpose: the cookie values live in the
# separate `cookies` dict, which is handed to requests on its own.
headers = {
    'Accept': 'application/json',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    # The article page the API call is made on behalf of.
    'Referer': 'https://support.zoom.com/hc/zh/article?id=zm_kb&sysparm_article=KB0069432',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'X-Transaction-Source': 'Interface=Web,Interface-Name=HC,Interface-Type=Service Portal,Interface-SysID=89275a53cb13020000f8d856634c9c51',
    'X-Use-Polaris': 'false',
    # NOTE(review): presumably a per-session token copied from the browser;
    # likely has to be refreshed together with the cookies -- TODO confirm.
    'X-UserToken': '51f7263487ef711481aec8cd0ebb355c186ebdcd75d1cab6f29335aa03a871b3bffff3f0',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'x-portal': '89275a53cb13020000f8d856634c9c51',
}
# Query-string parameters for the page API call.
params = {
    'id': 'zm_kb',
    'sysparm_article': 'KB0069432',
    # NOTE(review): looks like a millisecond epoch timestamp captured with
    # the original browser request -- TODO confirm whether it must be fresh.
    'time': '1704254869320',
    # Same identifier as the 'x-portal' request header.
    'portal_id': '89275a53cb13020000f8d856634c9c51',
    'request_uri': '/hc/zh/article?id=zm_kb&sysparm_article=KB0069432',
}
# Fetch the page payload for the KB article and cut the raw text that follows
# the "kbContentData" key out of the response body.
response = requests.get(
    'https://support.zoom.com/api/now/sp/page',
    params=params,
    cookies=cookies,
    headers=headers,
    # requests applies no timeout by default; bound the wait so a stalled
    # connection cannot hang the script forever.
    timeout=30,
)
# Fail loudly on an HTTP error status instead of regex-searching an error body.
response.raise_for_status()
data = response.text

# An earlier attempt, r'"kbContentData": \{' (with a space after the colon),
# matched nothing, so the working pattern has no whitespace after the colon.
# The capture runs from the key up to the "<li>Security enhancements" marker.
pattern = r'"kbContentData":(.*/?)<li>Security enhancements'
found = re.search(pattern, data)
if found is None:
    # Previously this surfaced as an opaque IndexError from `findall(...)[0]`.
    raise ValueError(
        'kbContentData fragment not found in the response; '
        'the session cookies/token may be stale or the article layout changed'
    )
# Same value the first element of re.findall(pattern, data) would give.
ret = found.group(1)
# NOTE(review): `ret` is still raw response text, so HTML inside it keeps its
# string escapes (e.g. "<\/strong"). If the body is JSON as it appears to be,
# decoding it with response.json() and reading the kbContentData entry should
# yield plain HTML that BeautifulSoup can parse -- the exact location of that
# key inside the payload has not been verified here.
看得出,得到的 response 是 JSON 格式,而我要获取的网页内容在 kbContentData 字段下。
但是数据解析难住我了(已知该数据是转义后的 HTML 内容)。
尝试方法1:用 re。但像下面这样在模式里加上空格和 \{ 之后,就匹配不到任何内容了,把空格换成 \s 也不行;明明从网页上复制出来的文本里可以直接搜到,匹配结果却是空的。(很可能是原始响应文本里冒号后面并没有空白字符,网页上看到的是格式化之后的结果;这种情况可以改用 \s* 来匹配零个或多个空白。)
pattern = r'"kbContentData": \{'
尝试方法2:可以得到数据,但得到的是带反斜杠转义的字符串(例如 <\/strong)。内容明明是 HTML,可我不知道怎样把这种转义后的 str 直接还原成 HTML,好让后面可以用 lxml 或者 bs4 进行解析。(这里的 \/ 应该是 JSON 字符串的转义写法,用 json.loads 或 response.json() 解码后应当就能还原。)
搞不懂,明明刚学了re,没解决。。。(╬◣д◢)
文章来源:https://blog.csdn.net/plato_2/article/details/135362822
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!