mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 03:15:17 +08:00
290 lines
11 KiB
Python
290 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
||
# Copyright (c) 2025 relakkes@gmail.com
|
||
#
|
||
# This file is part of MediaCrawler project.
|
||
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tools/browser_launcher.py
|
||
# GitHub: https://github.com/NanmiCoder
|
||
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
||
#
|
||
|
||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||
# 1. 不得用于任何商业用途。
|
||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||
# 5. 不得用于任何非法或不当的用途。
|
||
#
|
||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||
|
||
|
||
import os
|
||
import platform
|
||
import subprocess
|
||
import time
|
||
import socket
|
||
import signal
|
||
from typing import Optional, List, Tuple
|
||
import asyncio
|
||
from pathlib import Path
|
||
|
||
from tools import utils
|
||
|
||
|
||
class BrowserLauncher:
    """
    Locate and launch a locally installed Chrome/Edge/Chromium browser with
    the remote-debugging (CDP) port enabled, and shut it down again.

    Candidate install locations are probed for Windows, macOS and, as a
    fallback for any other ``platform.system()`` value, common Linux paths.
    """

    # Seconds to wait after each termination attempt before escalating.
    _SHUTDOWN_TIMEOUT = 5

    def __init__(self):
        # Value of platform.system(): "Windows", "Darwin", or anything else
        # (treated as Linux-like by the methods below).
        self.system = platform.system()
        # subprocess.Popen handle of the browser started by launch_browser().
        self.browser_process = None
        # Initialised to None; no method in this class assigns it.
        self.debug_port = None

    def detect_browser_paths(self) -> List[str]:
        """
        Detect browser executables available on this machine.

        Returns:
            The candidate paths that exist as regular files and are
            executable, in priority order (earlier entries are preferred).
        """
        if self.system == "Windows":
            # Common Chrome/Edge install locations on Windows. An undefined
            # environment variable is left unexpanded by expandvars and is
            # then simply filtered out by the existence check below.
            possible_paths = [
                # Chrome
                os.path.expandvars(r"%PROGRAMFILES%\Google\Chrome\Application\chrome.exe"),
                os.path.expandvars(r"%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe"),
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe"),
                # Edge
                os.path.expandvars(r"%PROGRAMFILES%\Microsoft\Edge\Application\msedge.exe"),
                os.path.expandvars(r"%PROGRAMFILES(X86)%\Microsoft\Edge\Application\msedge.exe"),
                # Chrome Beta/Dev/Canary
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome Beta\Application\chrome.exe"),
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome Dev\Application\chrome.exe"),
                os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome SxS\Application\chrome.exe"),
            ]
        elif self.system == "Darwin":  # macOS
            # Common Chrome/Edge install locations on macOS.
            possible_paths = [
                # Chrome
                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
                "/Applications/Google Chrome Dev.app/Contents/MacOS/Google Chrome Dev",
                "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
                # Edge
                "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
                "/Applications/Microsoft Edge Beta.app/Contents/MacOS/Microsoft Edge Beta",
                "/Applications/Microsoft Edge Dev.app/Contents/MacOS/Microsoft Edge Dev",
                "/Applications/Microsoft Edge Canary.app/Contents/MacOS/Microsoft Edge Canary",
            ]
        else:
            # Linux and any other system.
            possible_paths = [
                "/usr/bin/google-chrome",
                "/usr/bin/google-chrome-stable",
                "/usr/bin/google-chrome-beta",
                "/usr/bin/google-chrome-unstable",
                "/usr/bin/chromium-browser",
                "/usr/bin/chromium",
                "/snap/bin/chromium",
                "/usr/bin/microsoft-edge",
                "/usr/bin/microsoft-edge-stable",
                "/usr/bin/microsoft-edge-beta",
                "/usr/bin/microsoft-edge-dev",
            ]

        # Keep only candidates that exist and are executable.
        return [
            path
            for path in possible_paths
            if os.path.isfile(path) and os.access(path, os.X_OK)
        ]

    def find_available_port(self, start_port: int = 9222) -> int:
        """
        Find a TCP port on localhost that can currently be bound.

        At most 100 consecutive ports starting at ``start_port`` are tried.

        Args:
            start_port: First port number to try.

        Returns:
            The first port in the range that could be bound.

        Raises:
            RuntimeError: If none of the attempted ports is available.
        """
        last_port = start_port + 99  # at most 100 ports are tried
        for port in range(start_port, last_port + 1):
            # socket.bind() raises OverflowError (not OSError) for values
            # outside 0-65535, so such candidates are skipped rather than
            # letting that exception escape.
            if not 0 <= port <= 65535:
                continue
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(('localhost', port))
                    return port
            except OSError:
                continue

        raise RuntimeError(f"无法找到可用的端口,已尝试 {start_port} 到 {last_port}")

    def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
                       user_data_dir: Optional[str] = None) -> subprocess.Popen:
        """
        Start the browser process with remote debugging enabled.

        The started process is also stored in ``self.browser_process`` so
        that ``cleanup()`` can terminate it later.

        Args:
            browser_path: Path of the browser executable to run.
            debug_port: Port passed as ``--remote-debugging-port``.
            headless: When true, add the headless flags; otherwise start
                the window maximized.
            user_data_dir: Optional profile directory passed as
                ``--user-data-dir``.

        Returns:
            The ``subprocess.Popen`` handle of the browser process.

        Raises:
            Exception: Whatever ``subprocess.Popen`` raises is logged and
                re-raised unchanged.
        """
        # Base launch arguments.
        args = [
            browser_path,
            f"--remote-debugging-port={debug_port}",
            # Allow remote access.
            # NOTE(review): 0.0.0.0 asks the browser to expose the debugging
            # endpoint on every interface; confirm this is really required.
            "--remote-debugging-address=0.0.0.0",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-background-timer-throttling",
            "--disable-backgrounding-occluded-windows",
            "--disable-renderer-backgrounding",
            "--disable-features=TranslateUI",
            "--disable-ipc-flooding-protection",
            "--disable-hang-monitor",
            "--disable-prompt-on-repost",
            "--disable-sync",
            "--disable-dev-shm-usage",  # avoid shared-memory problems
            # Sandbox is turned off in CDP mode.
            # NOTE(review): this weakens browser isolation; confirm intent.
            "--no-sandbox",
            # Key anti-detection flags.
            "--disable-blink-features=AutomationControlled",  # hide the automation marker
            "--exclude-switches=enable-automation",  # drop the automation switch
            "--disable-infobars",  # no info bars
        ]

        if headless:
            args.extend([
                "--headless=new",  # use the new headless mode
                "--disable-gpu",
            ])
        else:
            # Extra flags for headed mode.
            args.extend([
                "--start-maximized",  # maximized window, closer to a real user
            ])

        # Profile directory.
        if user_data_dir:
            args.append(f"--user-data-dir={user_data_dir}")

        utils.logger.info(f"[BrowserLauncher] 启动浏览器: {browser_path}")
        utils.logger.info(f"[BrowserLauncher] 调试端口: {debug_port}")
        utils.logger.info(f"[BrowserLauncher] 无头模式: {headless}")

        popen_kwargs = {
            "stdout": subprocess.DEVNULL,
            "stderr": subprocess.DEVNULL,
        }
        if self.system == "Windows":
            # A new process group keeps Ctrl+C in the parent console from
            # reaching the browser.
            popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            # Run the child in its own session (setsid), which also gives it
            # its own process group for cleanup() to signal. The subprocess
            # documentation recommends this over preexec_fn=os.setsid, which
            # is unsafe when the parent has threads.
            popen_kwargs["start_new_session"] = True

        try:
            process = subprocess.Popen(args, **popen_kwargs)
        except Exception as e:
            utils.logger.error(f"[BrowserLauncher] 启动浏览器失败: {e}")
            raise

        self.browser_process = process
        return process

    def wait_for_browser_ready(self, debug_port: int, timeout: int = 30) -> bool:
        """
        Poll until a TCP connection to the debugging port succeeds.

        Args:
            debug_port: Port on localhost to probe.
            timeout: Maximum number of seconds to keep polling.

        Returns:
            True as soon as a connection succeeds, False if ``timeout``
            seconds elapse first.
        """
        utils.logger.info(f"[BrowserLauncher] 等待浏览器在端口 {debug_port} 上准备就绪...")

        # A monotonic clock keeps the timeout correct even if the system
        # wall clock is adjusted while waiting.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.settimeout(1)
                    result = s.connect_ex(('localhost', debug_port))
                    if result == 0:
                        utils.logger.info(f"[BrowserLauncher] 浏览器已在端口 {debug_port} 上准备就绪")
                        return True
            except Exception:
                # Deliberate best effort: any probe failure just means
                # "not ready yet", so retry until the deadline.
                pass

            time.sleep(0.5)

        utils.logger.error(f"[BrowserLauncher] 浏览器在 {timeout} 秒内未能准备就绪")
        return False

    def get_browser_info(self, browser_path: str) -> Tuple[str, str]:
        """
        Return a display name and version string for a browser executable.

        The name is guessed from substrings of the path; the version is the
        stripped stdout of running ``<browser_path> --version``.

        Args:
            browser_path: Path of the browser executable.

        Returns:
            ``(name, version)``. Either element falls back to
            ``"Unknown Browser"`` / ``"Unknown Version"`` when it cannot be
            determined; this method does not raise for ordinary failures.
        """
        if not isinstance(browser_path, str):
            return "Unknown Browser", "Unknown Version"

        lowered = browser_path.lower()
        if "chrome" in lowered:
            name = "Google Chrome"
        elif "edge" in lowered:  # also matches "msedge"
            name = "Microsoft Edge"
        elif "chromium" in lowered:
            name = "Chromium"
        else:
            name = "Unknown Browser"

        # Try to read the version. Catch Exception rather than using a bare
        # except so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            result = subprocess.run([browser_path, "--version"],
                                    capture_output=True, text=True, timeout=5)
            version = result.stdout.strip() if result.stdout else "Unknown Version"
        except Exception:
            version = "Unknown Version"

        return name, version

    def cleanup(self):
        """
        Release resources by shutting down the launched browser process.

        Does nothing when no process is recorded. Errors during shutdown
        are logged as warnings, not raised, and ``self.browser_process`` is
        reset to None in every case.
        """
        process = self.browser_process
        if not process:
            return

        if process.poll() is not None:
            utils.logger.info("[BrowserLauncher] 浏览器进程已退出,无需清理")
            self.browser_process = None
            return

        utils.logger.info("[BrowserLauncher] 正在关闭浏览器进程...")

        try:
            if self.system == "Windows":
                self._terminate_windows(process)
            else:
                self._terminate_posix(process)

            utils.logger.info("[BrowserLauncher] 浏览器进程已关闭")
        except Exception as e:
            utils.logger.warning(f"[BrowserLauncher] 关闭浏览器进程时出错: {e}")
        finally:
            self.browser_process = None

    def _terminate_windows(self, process):
        """Terminate the process; fall back to ``taskkill /F /T`` on timeout."""
        # Try a normal termination first.
        process.terminate()
        try:
            process.wait(timeout=self._SHUTDOWN_TIMEOUT)
        except subprocess.TimeoutExpired:
            utils.logger.warning("[BrowserLauncher] 正常终止超时,使用taskkill强制结束")
            subprocess.run(
                ["taskkill", "/F", "/T", "/PID", str(process.pid)],
                capture_output=True,
                check=False,
            )
            process.wait(timeout=self._SHUTDOWN_TIMEOUT)

    def _terminate_posix(self, process):
        """Send SIGTERM to the process group; escalate to SIGKILL on timeout."""
        try:
            pgid = os.getpgid(process.pid)
            os.killpg(pgid, signal.SIGTERM)
        except ProcessLookupError:
            # The process (group) vanished after the poll() check in
            # cleanup(); that is a normal exit, not a cleanup failure.
            utils.logger.info("[BrowserLauncher] 浏览器进程组不存在,可能已退出")
            return

        try:
            process.wait(timeout=self._SHUTDOWN_TIMEOUT)
        except subprocess.TimeoutExpired:
            utils.logger.warning("[BrowserLauncher] 优雅关闭超时,发送SIGKILL")
            try:
                os.killpg(pgid, signal.SIGKILL)
            except ProcessLookupError:
                # Exited between the timeout and the kill; still reap below.
                pass
            process.wait(timeout=self._SHUTDOWN_TIMEOUT)
|