Notes on scraping the Maoyan Movies TOP100 board: code for collecting the title, stars, and release date
2023-12-24 12:23:11
import time

import requests
from lxml import etree
import pymysql


class MaoYanSpider(object):
    def __init__(self):
        self.url = "https://www.maoyan.com/board/4?offset={}"
        # If requests get blocked by anti-scraping checks, the headers may be the problem; swap in a fresh set.
        self.headers = {  # request headers
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'referer': 'https://passport.meituan.com/',
            'Cookie': '__mta=42753434.1633656738499.1634781127005.1634781128998.34; uuid_n_v=v1; _lxsdk_cuid=17c5d879290c8-03443510ba6172-6373267-144000-17c5d879291c8; uuid=60ACEF00317A11ECAAC07D88ABE178B722CFA72214D742A2849B46660B8F79A8; _lxsdk=60ACEF00317A11ECAAC07D88ABE178B722CFA72214D742A2849B46660B8F79A8; _csrf=94b23e138a83e44c117736c59d0901983cb89b75a2c0de2587b8c273d115e639; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1634716251,1634716252,1634719353,1634779997; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1634781129; _lxsdk_s=17ca07b2470-536-b73-84%7C%7C12'
        }
        # self.proxies = {'http': 'http://121.41.122.215', 'https': '121.41.122.215'}
        # Connect to the database (charset=utf8mb4 so Chinese titles are stored correctly)
        self.db = pymysql.connect(host="127.0.0.1", user='root', password='', db='maoyandb', charset='utf8mb4')
        # Get a database cursor (similar to a Statement object in Java/JDBC)
        self.cursor = self.db.cursor()

    def get_html(self, url):
        html = requests.get(url=url, headers=self.headers).text
        # print(html)
        # Each <dd> under the board list is one movie entry
        r_list = etree.HTML(html).xpath('//dl[@class="board-wrapper"]/dd')
        # print(r_list)
        items = []
        for dd in r_list:
            i = (
                dd.xpath('.//p[@class="name"]/a/text()')[0].strip(),            # title
                dd.xpath('.//p[@class="star"]/text()')[0].strip()[3:],          # stars, "主演:" prefix removed
                dd.xpath('.//p[@class="releasetime"]/text()')[0].strip()[5:15]  # release date, "上映时间:" prefix removed
            )
            # print(i)
            items.append(i)
        self.save_html(items)

    def save_html(self, items):
        try:
            sql = 'insert into filmtab(name,star,time) values(%s,%s,%s)'
            self.cursor.executemany(sql, items)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            print(str(e.args))

    def run(self):
        offset = int(input("Enter page number: "))
        url = self.url.format((offset - 1) * 10)
        self.get_html(url)
        # Release database resources
        self.cursor.close()
        self.db.close()


if __name__ == '__main__':
    start = time.time()
    spider = MaoYanSpider()
    spider.run()
    end = time.time()
    print("Scraping finished, total time: %.2fs" % (end - start))
Source: https://blog.csdn.net/qwy715229258163/article/details/135179259