mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 03:15:17 +08:00
146 lines
4.5 KiB
Python
146 lines
4.5 KiB
Python
# -*- coding: utf-8 -*-
|
||
# Copyright (c) 2025 relakkes@gmail.com
|
||
#
|
||
# This file is part of MediaCrawler project.
|
||
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/proxy/proxy_ip_pool.py
|
||
# GitHub: https://github.com/NanmiCoder
|
||
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||
#
|
||
|
||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||
# 1. 不得用于任何商业用途。
|
||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||
# 5. 不得用于任何非法或不当的用途。
|
||
#
|
||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||
|
||
# -*- coding: utf-8 -*-
|
||
# @Author : relakkes@gmail.com
|
||
# @Time : 2023/12/2 13:45
|
||
# @Desc : ip代理池实现
|
||
import random
|
||
from typing import Dict, List
|
||
|
||
import httpx
|
||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||
|
||
import config
|
||
from proxy.providers import (
|
||
new_kuai_daili_proxy,
|
||
new_wandou_http_proxy,
|
||
)
|
||
from tools import utils
|
||
|
||
from .base_proxy import ProxyProvider
|
||
from .types import IpInfoModel, ProviderNameEnum
|
||
|
||
|
||
class ProxyIpPool:
|
||
|
||
def __init__(
|
||
self, ip_pool_count: int, enable_validate_ip: bool, ip_provider: ProxyProvider
|
||
) -> None:
|
||
"""
|
||
|
||
Args:
|
||
ip_pool_count:
|
||
enable_validate_ip:
|
||
ip_provider:
|
||
"""
|
||
self.valid_ip_url = "https://echo.apifox.cn/" # 验证 IP 是否有效的地址
|
||
self.ip_pool_count = ip_pool_count
|
||
self.enable_validate_ip = enable_validate_ip
|
||
self.proxy_list: List[IpInfoModel] = []
|
||
self.ip_provider: ProxyProvider = ip_provider
|
||
|
||
async def load_proxies(self) -> None:
|
||
"""
|
||
加载IP代理
|
||
Returns:
|
||
|
||
"""
|
||
self.proxy_list = await self.ip_provider.get_proxy(self.ip_pool_count)
|
||
|
||
async def _is_valid_proxy(self, proxy: IpInfoModel) -> bool:
|
||
"""
|
||
验证代理IP是否有效
|
||
:param proxy:
|
||
:return:
|
||
"""
|
||
utils.logger.info(
|
||
f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} is it valid "
|
||
)
|
||
try:
|
||
# httpx 0.28.1 需要直接传入代理URL字符串,而不是字典
|
||
if proxy.user and proxy.password:
|
||
proxy_url = f"http://{proxy.user}:{proxy.password}@{proxy.ip}:{proxy.port}"
|
||
else:
|
||
proxy_url = f"http://{proxy.ip}:{proxy.port}"
|
||
|
||
async with httpx.AsyncClient(proxy=proxy_url) as client:
|
||
response = await client.get(self.valid_ip_url)
|
||
if response.status_code == 200:
|
||
return True
|
||
else:
|
||
return False
|
||
except Exception as e:
|
||
utils.logger.info(
|
||
f"[ProxyIpPool._is_valid_proxy] testing {proxy.ip} err: {e}"
|
||
)
|
||
raise e
|
||
|
||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||
async def get_proxy(self) -> IpInfoModel:
|
||
"""
|
||
从代理池中随机提取一个代理IP
|
||
:return:
|
||
"""
|
||
if len(self.proxy_list) == 0:
|
||
await self._reload_proxies()
|
||
|
||
proxy = random.choice(self.proxy_list)
|
||
self.proxy_list.remove(proxy) # 取出来一个IP就应该移出掉
|
||
if self.enable_validate_ip:
|
||
if not await self._is_valid_proxy(proxy):
|
||
raise Exception(
|
||
"[ProxyIpPool.get_proxy] current ip invalid and again get it"
|
||
)
|
||
return proxy
|
||
|
||
async def _reload_proxies(self):
|
||
"""
|
||
# 重新加载代理池
|
||
:return:
|
||
"""
|
||
self.proxy_list = []
|
||
await self.load_proxies()
|
||
|
||
|
||
IpProxyProvider: Dict[str, ProxyProvider] = {
|
||
ProviderNameEnum.KUAI_DAILI_PROVIDER.value: new_kuai_daili_proxy(),
|
||
ProviderNameEnum.WANDOU_HTTP_PROVIDER.value: new_wandou_http_proxy(),
|
||
}
|
||
|
||
|
||
async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyIpPool:
|
||
"""
|
||
创建 IP 代理池
|
||
:param ip_pool_count: ip池子的数量
|
||
:param enable_validate_ip: 是否开启验证IP代理
|
||
:return:
|
||
"""
|
||
pool = ProxyIpPool(
|
||
ip_pool_count=ip_pool_count,
|
||
enable_validate_ip=enable_validate_ip,
|
||
ip_provider=IpProxyProvider.get(config.IP_PROXY_PROVIDER_NAME),
|
||
)
|
||
await pool.load_proxies()
|
||
return pool
|
||
|
||
|
||
if __name__ == "__main__":
|
||
pass
|