如何建立自己的代理IP池，减少爬虫被封的几率

import requests
from bs4 import BeautifulSoup

def get_proxies(url, timeout=10):
    """Scrape ``ip:port`` proxy strings from the HTML tables found at *url*.

    Every ``<tr>`` that contains at least two ``<td>`` cells is treated as a
    proxy row: the first cell is read as the IP and the second as the port.
    NOTE(review): this assumes the target page lays its proxies out that way;
    confirm against the actual site markup.

    Args:
        url: Address of the page listing the proxies.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``'ip:port'`` strings in page order (possibly empty).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Without a timeout requests can wait on an unresponsive server forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    proxies = []
    for row in soup.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) < 2:
            continue
        # strip=True drops the newlines/indentation that usually surround
        # table cell text and would otherwise end up inside the address.
        ip = cols[0].get_text(strip=True)
        port = cols[1].get_text(strip=True)
        if ip and port:
            proxies.append(ip + ':' + port)
    return proxies

# The proxy listing site used in this example is https://www.zdaye.com/
url = 'https://www.zdaye.com/'
proxies = get_proxies(url)
print(proxies)

2.2 付费代理IP服务商

付费代理IP服务商通常提供更稳定和可靠的代理IP。我们可以在他们的网站上购买代理IP，并获得相应的API接口来获取代理IP。以下是一个示例代码，使用付费代理IP服务商的API接口获取代理IP：

import requests

def get_proxies(api_key, timeout=10):
    """Fetch ``ip:port`` proxy strings from the provider's HTTP API.

    The response body is decoded as JSON and iterated; each item is indexed
    with the keys ``'ip'`` and ``'port'``.

    Args:
        api_key: API key issued by the proxy provider.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``'ip:port'`` strings in the order the API returned them.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): placeholder endpoint. It is plain http, so the key would
    # travel unencrypted; switch to https if the real provider offers it.
    url = 'http://api.service.com/proxies'
    # Passing the key via params lets requests URL-encode it.
    response = requests.get(url, params={'api_key': api_key}, timeout=timeout)
    response.raise_for_status()
    # format() also copes with a port delivered as a JSON number, where
    # string concatenation would raise TypeError.
    return ['{}:{}'.format(proxy['ip'], proxy['port'])
            for proxy in response.json()]

# Placeholder key; replace it with the key issued by the proxy provider.
api_key = 'your_api_key'
proxies = get_proxies(api_key)
print(proxies)

三、验证代理IP的可用性

获取代理IP之后，我们需要验证它们的可用性，以确保它们可以正常工作。以下是一个示例代码，验证代理IP的可用性：

import requests

def test_proxy(proxy):
    """Return True if a GET request routed through *proxy* answers with 200.

    Args:
        proxy: Proxy address as an ``'ip:port'`` string.

    Returns:
        True when the test page is fetched with status 200 within 5 seconds,
        False on any request failure or any other status code.
    """
    url = 'http://www.example.com/'
    # The scheme of a proxy URL says how to talk to the proxy itself.
    # NOTE(review): this assumes a plain-HTTP proxy, hence 'http://' for both
    # entries; use 'https://' only for proxies that accept TLS connections.
    proxies = {
        'http': 'http://' + proxy,
        'https': 'http://' + proxy,
    }
    try:
        response = requests.get(url, proxies=proxies, timeout=5)
    except (requests.RequestException, ValueError):
        # Unreachable/slow proxy or a malformed proxy address: not usable.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return False
    return response.status_code == 200

# Validate the proxies fetched earlier and keep only the ones that work.
valid_proxies = [proxy for proxy in proxies if test_proxy(proxy)]

print(valid_proxies)

四、建立代理IP池

在验证代理IP可用性之后，我们可以将它们保存到一个代理IP池中，并定期更新和检测。以下是一个示例代码，建立代理IP池：

import random

class ProxyPool:
    """In-memory collection of proxy addresses with random selection."""

    def __init__(self):
        # Proxies currently held by the pool, in insertion order.
        self.proxies = []

    def add_proxy(self, proxy):
        """Append *proxy* to the pool."""
        self.proxies.append(proxy)

    def clear(self):
        """Remove every proxy from the pool."""
        self.proxies.clear()

    def get_proxy(self):
        """Return one proxy picked with ``random.choice``.

        Raises:
            IndexError: If the pool is empty.
        """
        if not self.proxies:
            # Same exception type random.choice raises on an empty list,
            # but with a message that names the actual problem.
            raise IndexError('proxy pool is empty')
        return random.choice(self.proxies)

# Sample proxy list used for this example.
proxies = ['127.0.0.1:8888', '127.0.0.1:8889']

proxy_pool = ProxyPool()
for proxy in proxies:
    proxy_pool.add_proxy(proxy)

# Crawl through a proxy drawn from the pool.
url = 'http://www.example.com/'
proxy = proxy_pool.get_proxy()
# The scheme of a proxy URL says how to talk to the proxy itself.
# NOTE(review): this assumes a plain-HTTP proxy, hence 'http://' for both
# entries; use 'https://' only for proxies that accept TLS connections.
proxies = {
    'http': 'http://' + proxy,
    'https': 'http://' + proxy,
}
# A timeout keeps a dead proxy from stalling the crawl indefinitely.
response = requests.get(url, proxies=proxies, timeout=10)
print(response.text)

五、定期更新代理IP池

为了保证代理IP池的可用性，我们需要定期更新和检测代理IP。以下是一个示例代码，定期更新代理IP池：

import time

def update_proxy_pool(proxy_pool, source='https://www.zdaye.com/',
                      interval=60 * 60):
    """Refresh *proxy_pool* with freshly validated proxies, forever.

    Each cycle fetches candidates with ``get_proxies(source)``, keeps those
    that pass ``test_proxy`` and stores them in the pool, then sleeps for
    *interval* seconds. This function never returns, so run it in a
    background thread if the caller has other work to do.

    Args:
        proxy_pool: Pool object whose ``proxies`` list is replaced.
        source: The single argument forwarded to ``get_proxies`` (a page URL
            for the scraping version, an API key for the API version).
            Defaults to the sample listing site used in this article.
        interval: Seconds to sleep between refreshes (default: one hour).
    """
    while True:
        try:
            # get_proxies requires one argument; calling it with none
            # raises TypeError.
            candidates = get_proxies(source)
        except requests.RequestException:
            # A failed fetch must not kill the updater: keep the current
            # pool and try again on the next cycle.
            candidates = None

        if candidates is not None:
            valid_proxies = [proxy for proxy in candidates
                             if test_proxy(proxy)]
            # Swap the list in one assignment: this needs only the
            # `proxies` attribute, and the pool is never left empty
            # halfway through a refresh.
            proxy_pool.proxies = valid_proxies

        # Sleep before the next refresh.
        time.sleep(interval)

import threading

# Create the proxy pool.
proxy_pool = ProxyPool()

# update_proxy_pool() loops forever, so calling it directly would block here
# and the crawl below would never run. Run it in a daemon thread instead; a
# daemon thread does not keep the program alive once the main script ends.
updater = threading.Thread(
    target=update_proxy_pool, args=(proxy_pool,), daemon=True
)
updater.start()

# Wait until a refresh has put at least one proxy in the pool.
while not proxy_pool.proxies:
    if not updater.is_alive():
        raise RuntimeError(
            'proxy pool updater stopped before any proxy became available'
        )
    time.sleep(1)

# Crawl through a proxy drawn from the pool.
url = 'http://www.example.com/'
proxy = proxy_pool.get_proxy()
# The scheme of a proxy URL says how to talk to the proxy itself.
# NOTE(review): this assumes a plain-HTTP proxy, hence 'http://' for both
# entries; use 'https://' only for proxies that accept TLS connections.
proxies = {
    'http': 'http://' + proxy,
    'https': 'http://' + proxy,
}
# A timeout keeps a dead proxy from stalling the crawl indefinitely.
response = requests.get(url, proxies=proxies, timeout=10)
print(response.text)

总结

到此为止，我们已经完成了一个简单的代理IP池的建立。通过使用代理IP池，我们可以增加爬虫的隐蔽性和稳定性，降低被封的几率。同时，我们也需要注意使用合法和可靠的代理IP，以避免违法和不良行为。

文章来源:https://blog.csdn.net/wq10_12/article/details/135085280
本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若内容造成侵权/违法违规/事实不符，请联系我的编程经验分享网邮箱：veading@qq.com进行投诉反馈，一经查实，立即删除！