获取网页信息

2024-01-03 15:30:54

每次copy & paste总是很麻烦,现在有点问题,先记录下来。

需求:获取url 里Feature list,并输出表格形式

可以用Convert curl commands to code:得到get请求的header,cookie等?

import requests
import re
from json2html import json2html
from bs4 import BeautifulSoup

# Cookies sent with the request below (passed via `cookies=` to requests.get).
# NOTE(review): these look like values captured from a real browser session
# (e.g. JSESSIONID, glide_user_route); presumably they expire and should not be
# published or committed as-is -- confirm and rotate if they are still valid.
cookies = {
    # Analytics / tracking cookies (names suggest Google Analytics, Facebook, etc.).
    '_ga': 'GA1.2.1362872320.1699326902',
    '_fbp': 'fb.1.1703745569173.788449175',
    '_zm_visitor_guid': 'ab14067a105b55591ca36931e79a6fc0',
    '_zm_mtk_guid': 'b214987e283ec1df03f09df41170675b',
    '_ds_id': '8c2d2994-3b41-4b59-be95-2b8717ffe0e6',
    '__utmzz': 'source=(direct)|medium=(none)|campaign=(not set)',
    'AMP_MKTG_0753e77572': 'JTdCJTdE',
    '_gcl_au': '1.1.55355038.1703817513',
    'OnetrustActiveGroups': 'C0004C0003C0002C0001',
    'AMP_0753e77572': 'JTdCJTIyZGV2aWNlSWQlMjIlM0ElMjJkYWQyMGM3NS0xYzdkLTRmODYtYjI4Yi03MTNmZTNlY2E5ZjglMjIlMkMlMjJzZXNzaW9uSWQlMjIlM0ExNzAzODE3NTEyNDY3JTJDJTIyb3B0T3V0JTIyJTNBZmFsc2UlMkMlMjJsYXN0RXZlbnRUaW1lJTIyJTNBMTcwMzgxNzUxMzMxMiUyQyUyMmxhc3RFdmVudElkJTIyJTNBMyU3RA==',
    '_yjsu_yjad': '1703817513.e9d3aadf-244b-4756-90c8-d8152831b27e',
    '_uetvid': '5c32b050a5f311ee8e0337e664efcd94',
    'iv': '51a85645-5246-4995-9a5b-627ccafbae0b',
    '_cs_c': '0',
    '_cs_id': '0b459793-a9d5-a89c-c1e2-70499565b08c.1703817514.2.1703833540.1703833540.1.1737981514343',
    '_gid': 'GA1.2.1035150089.1704176623',
    # Server-side session / routing cookies (presumably what the API actually needs -- TODO confirm).
    'BIGipServerpool_zoomus': '2f6ba358017c66e5283571a5c5fc3b1a',
    'JSESSIONID': '2C44F6D93F6593E571F97C2BAE1AF4DB',
    'glide_user_route': 'glide.de6ecf26cf6f93e1b52b94d2be12e7df',
    'glide_language': 'zh',
    # Cookie-consent banner state.
    'OptanonAlertBoxClosed': '2024-01-03T04:07:00.123Z',
    'OptanonConsent': 'isGpcEnabled=0&datestamp=Wed+Jan+03+2024+12%3A07%3A00+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202310.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=e9dfd41b-73f2-470f-ab16-4e504558809b&interactionCount=32&landingPath=NotLandingPage&groups=C0004%3A0%2CC0003%3A0%2CC0002%3A0%2CC0001%3A1&geolocation=JP%3B13&AwaitingReconsent=false',
}

# HTTP request headers sent with the request below (passed via `headers=`).
# They mimic a Chrome 120 XMLHttpRequest issued from the article page.
headers = {
    # Ask for a JSON response body.
    'Accept': 'application/json',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    # The 'Cookie' header is intentionally not set here: the same cookie values
    # are supplied separately through the `cookies` dict.
    'Referer': 'https://support.zoom.com/hc/zh/article?id=zm_kb&sysparm_article=KB0069432',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'X-Transaction-Source': 'Interface=Web,Interface-Name=HC,Interface-Type=Service Portal,Interface-SysID=89275a53cb13020000f8d856634c9c51',
    'X-Use-Polaris': 'false',
    # NOTE(review): presumably a session-bound token copied from the browser;
    # it likely expires together with the session cookies -- confirm.
    'X-UserToken': '51f7263487ef711481aec8cd0ebb355c186ebdcd75d1cab6f29335aa03a871b3bffff3f0',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    # Same identifier as `portal_id` in the query parameters and Interface-SysID above.
    'x-portal': '89275a53cb13020000f8d856634c9c51',
}

# Query-string parameters for the page API request below (passed via `params=`).
params = {
    'id': 'zm_kb',
    # Knowledge-base article number; also appears in `request_uri` and the Referer header.
    'sysparm_article': 'KB0069432',
    # NOTE(review): looks like a millisecond epoch timestamp frozen at capture
    # time -- confirm whether the server requires a fresh value.
    'time': '1704254869320',
    # Same identifier as the 'x-portal' request header.
    'portal_id': '89275a53cb13020000f8d856634c9c51',
    # Path and query of the article page whose data is being requested.
    'request_uri': '/hc/zh/article?id=zm_kb&sysparm_article=KB0069432',
}

# Fetch the page payload for the article using the captured cookies, headers
# and query parameters.
response = requests.get(
    'https://support.zoom.com/api/now/sp/page',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=30,  # requests has no default timeout; without one the call can hang forever
)
# Fail loudly on 4xx/5xx (e.g. expired session) instead of searching an error page.
response.raise_for_status()

data = response.text

# Capture everything between the "kbContentData" key and the
# "<li>Security enhancements" marker in the raw response text.
# NOTE(review): the raw body is presumably compact JSON (no space after ':'),
# which would explain why a pattern containing '": {' finds nothing -- confirm
# by printing response.text. Since the body is reported to be JSON, parsing it
# with response.json() would also undo the backslash escaping (e.g. '<\/strong>')
# so the HTML could be handed to BeautifulSoup directly -- verify the structure.
pattern = r'"kbContentData":(.*/?)<li>Security enhancements'
match = re.search(pattern, data)
if match is None:
    # Previously this surfaced as an opaque IndexError from `findall(...)[0]`.
    raise ValueError(
        'kbContentData section not found in response '
        f'(HTTP {response.status_code}, {len(data)} characters)'
    )
# First capture group of the first match: same value as re.findall(pattern, data)[0].
ret = match.group(1)

看得出,得到的response为JSON格式,但是我要获取的网页内容在kbContentData字段下。

但是数据解析难住我了(已知数据为转义后的HTML内容)。

尝试方法1:用re,但尝试如下:加了空格和\{的匹配后就获取不到匹配项了,用\s代替空格也不行,明明网页上复制可以直接找得到的???但就是匹配为空

pattern = r'"kbContentData": \{'

尝试方法2:可以得到数据,但得到的是加了反斜杠转义的字符串(例如 <\/strong)。它本来是HTML内容,但是我不知道怎么直接把这个转义后的str转成HTML,以便使用lxml或者bs4进行解析???

 

搞不懂,明明刚学了re,没解决。。。(╬◣д◢)

文章来源:https://blog.csdn.net/plato_2/article/details/135362822
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。