Files
MediaCrawler/proxy/proxy_ip_pool.py

146 lines
4.5 KiB
Python
Raw Normal View History

2025-11-18 12:24:02 +08:00
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/proxy/proxy_ip_pool.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
2023-12-02 16:14:36 +08:00
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 13:45
# @Desc : ip代理池实现
import random
2024-04-05 10:44:05 +08:00
from typing import Dict, List
2023-12-02 16:14:36 +08:00
import httpx
from tenacity import retry, stop_after_attempt, wait_fixed
2024-04-05 10:44:05 +08:00
import config
from proxy.providers import (
new_kuai_daili_proxy,
new_wandou_http_proxy,
)
2023-12-02 16:14:36 +08:00
from tools import utils
2024-04-05 10:44:05 +08:00
from .base_proxy import ProxyProvider
from .types import IpInfoModel, ProviderNameEnum
2023-12-02 16:14:36 +08:00
class ProxyIpPool:
    """Pool of proxy IPs fetched from a provider and handed out one at a time.

    A proxy is removed from the pool as soon as it is handed out. When the
    pool is empty, the next ``get_proxy`` call refills it from the provider.
    """

    def __init__(
        self, ip_pool_count: int, enable_validate_ip: bool, ip_provider: ProxyProvider
    ) -> None:
        """
        Args:
            ip_pool_count: number of proxies requested from the provider on each load.
            enable_validate_ip: when true, ``get_proxy`` probes a proxy before returning it.
            ip_provider: provider the proxies are fetched from.
        """
        # URL requested through a proxy to check that the proxy works.
        self.valid_ip_url = "https://echo.apifox.cn/"
        self.ip_pool_count = ip_pool_count
        self.enable_validate_ip = enable_validate_ip
        self.proxy_list: List[IpInfoModel] = []
        self.ip_provider: ProxyProvider = ip_provider

    async def load_proxies(self) -> None:
        """Replace the pool contents with a fresh batch from the provider."""
        self.proxy_list = await self.ip_provider.get_proxy(self.ip_pool_count)

    async def _is_valid_proxy(self, proxy: IpInfoModel) -> bool:
        """Check whether a proxy works by requesting ``valid_ip_url`` through it.

        Args:
            proxy: the proxy to probe.

        Returns:
            True when the probe request answers with HTTP 200, False for any
            other status code.

        Raises:
            Exception: any error raised while sending the probe request is
                logged and then propagated unchanged.
        """
        utils.logger.info(
            f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} is it valid "
        )
        try:
            # httpx 0.28.1 expects the proxy as a URL string, not a dict.
            if proxy.user and proxy.password:
                proxy_url = f"http://{proxy.user}:{proxy.password}@{proxy.ip}:{proxy.port}"
            else:
                proxy_url = f"http://{proxy.ip}:{proxy.port}"

            async with httpx.AsyncClient(proxy=proxy_url) as client:
                response = await client.get(self.valid_ip_url)
            return response.status_code == 200
        except Exception as e:
            utils.logger.info(
                f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} err: {e}"
            )
            # Bare raise keeps the original traceback intact.
            raise

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def get_proxy(self) -> IpInfoModel:
        """Take one randomly chosen proxy out of the pool.

        The pool is reloaded from the provider first when it is empty. The
        chosen proxy is removed from the pool whether or not it passes
        validation. Any exception raised here makes the ``retry`` decorator
        run the method again: up to 3 attempts, 1 second apart.

        Returns:
            The selected proxy.

        Raises:
            RuntimeError: the pool is still empty after reloading.
            Exception: validation is enabled and the selected proxy is invalid.
            (Once all attempts are used up, tenacity reports the failure to
            the caller as ``tenacity.RetryError``.)
        """
        if not self.proxy_list:
            await self._reload_proxies()

        # The provider may hand back nothing; fail with a clear message rather
        # than the opaque IndexError random selection raises on an empty list.
        if not self.proxy_list:
            raise RuntimeError(
                "[ProxyIpPool.get_proxy] ip provider returned no proxies, pool is empty"
            )

        # Pop by random index: a handed-out IP must leave the pool, and popping
        # removes exactly the chosen entry without an equality scan.
        proxy = self.proxy_list.pop(random.randrange(len(self.proxy_list)))

        if self.enable_validate_ip and not await self._is_valid_proxy(proxy):
            raise Exception(
                "[ProxyIpPool.get_proxy] current ip invalid and again get it"
            )
        return proxy

    async def _reload_proxies(self) -> None:
        """Empty the pool and load a fresh batch from the provider."""
        self.proxy_list = []
        await self.load_proxies()
2024-04-05 10:44:05 +08:00
# Registry of the available proxy providers, keyed by provider name.
# Every provider is instantiated once, when this module is imported.
IpProxyProvider: Dict[str, ProxyProvider] = {
    # KuaiDaili provider
    ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy(),
    # Wandou HTTP provider
    ProviderNameEnum.WANDOU_HTTP_PROVIDER.value: new_wandou_http_proxy(),
}
async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyIpPool:
    """Create a proxy IP pool and load its first batch of proxies.

    The provider is looked up in ``IpProxyProvider`` under the name configured
    in ``config.IP_PROXY_PROVIDER_NAME``.

    Args:
        ip_pool_count: number of proxies the pool requests per load.
        enable_validate_ip: whether proxies are validated before being handed out.

    Returns:
        A ``ProxyIpPool`` whose proxies have already been loaded.

    Raises:
        ValueError: the configured provider name is not registered.
    """
    provider_name = config.IP_PROXY_PROVIDER_NAME
    ip_provider = IpProxyProvider.get(provider_name)
    # Fail fast on a misconfigured name; otherwise None would reach the pool
    # and only surface later as an AttributeError while loading proxies.
    if ip_provider is None:
        raise ValueError(
            f"[create_ip_pool] unknown ip proxy provider {provider_name!r}, "
            f"supported providers: {sorted(IpProxyProvider)}"
        )

    pool = ProxyIpPool(
        ip_pool_count=ip_pool_count,
        enable_validate_ip=enable_validate_ip,
        ip_provider=ip_provider,
    )
    await pool.load_proxies()
    return pool
if __name__ == "__main__":
    # Intentionally a no-op: running this module directly does nothing.
    pass