Files
MediaCrawler/tools/browser_launcher.py
2025-11-18 12:24:02 +08:00

290 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tools/browser_launcher.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import os
import platform
import subprocess
import time
import socket
import signal
from typing import Optional, List, Tuple
import asyncio
from pathlib import Path
from tools import utils
class BrowserLauncher:
"""
浏览器启动器用于检测和启动用户的Chrome/Edge浏览器
支持Windows和macOS系统
"""
def __init__(self):
self.system = platform.system()
self.browser_process = None
self.debug_port = None
def detect_browser_paths(self) -> List[str]:
"""
检测系统中可用的浏览器路径
返回按优先级排序的浏览器路径列表
"""
paths = []
if self.system == "Windows":
# Windows下的常见Chrome/Edge安装路径
possible_paths = [
# Chrome路径
os.path.expandvars(r"%PROGRAMFILES%\Google\Chrome\Application\chrome.exe"),
os.path.expandvars(r"%PROGRAMFILES(X86)%\Google\Chrome\Application\chrome.exe"),
os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome\Application\chrome.exe"),
# Edge路径
os.path.expandvars(r"%PROGRAMFILES%\Microsoft\Edge\Application\msedge.exe"),
os.path.expandvars(r"%PROGRAMFILES(X86)%\Microsoft\Edge\Application\msedge.exe"),
# Chrome Beta/Dev/Canary
os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome Beta\Application\chrome.exe"),
os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome Dev\Application\chrome.exe"),
os.path.expandvars(r"%LOCALAPPDATA%\Google\Chrome SxS\Application\chrome.exe"),
]
elif self.system == "Darwin": # macOS
# macOS下的常见Chrome/Edge安装路径
possible_paths = [
# Chrome路径
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Google Chrome Beta.app/Contents/MacOS/Google Chrome Beta",
"/Applications/Google Chrome Dev.app/Contents/MacOS/Google Chrome Dev",
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
# Edge路径
"/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
"/Applications/Microsoft Edge Beta.app/Contents/MacOS/Microsoft Edge Beta",
"/Applications/Microsoft Edge Dev.app/Contents/MacOS/Microsoft Edge Dev",
"/Applications/Microsoft Edge Canary.app/Contents/MacOS/Microsoft Edge Canary",
]
else:
# Linux等其他系统
possible_paths = [
"/usr/bin/google-chrome",
"/usr/bin/google-chrome-stable",
"/usr/bin/google-chrome-beta",
"/usr/bin/google-chrome-unstable",
"/usr/bin/chromium-browser",
"/usr/bin/chromium",
"/snap/bin/chromium",
"/usr/bin/microsoft-edge",
"/usr/bin/microsoft-edge-stable",
"/usr/bin/microsoft-edge-beta",
"/usr/bin/microsoft-edge-dev",
]
# 检查路径是否存在且可执行
for path in possible_paths:
if os.path.isfile(path) and os.access(path, os.X_OK):
paths.append(path)
return paths
def find_available_port(self, start_port: int = 9222) -> int:
"""
查找可用的端口
"""
port = start_port
while port < start_port + 100: # 最多尝试100个端口
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('localhost', port))
return port
except OSError:
port += 1
raise RuntimeError(f"无法找到可用的端口,已尝试 {start_port}{port-1}")
def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
user_data_dir: Optional[str] = None) -> subprocess.Popen:
"""
启动浏览器进程
"""
# 基本启动参数
args = [
browser_path,
f"--remote-debugging-port={debug_port}",
"--remote-debugging-address=0.0.0.0", # 允许远程访问
"--no-first-run",
"--no-default-browser-check",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-renderer-backgrounding",
"--disable-features=TranslateUI",
"--disable-ipc-flooding-protection",
"--disable-hang-monitor",
"--disable-prompt-on-repost",
"--disable-sync",
"--disable-dev-shm-usage", # 避免共享内存问题
"--no-sandbox", # 在CDP模式下关闭沙箱
# 🔥 关键反检测参数
"--disable-blink-features=AutomationControlled", # 禁用自动化控制标记
"--exclude-switches=enable-automation", # 排除自动化开关
"--disable-infobars", # 禁用信息栏
]
# 无头模式
if headless:
args.extend([
"--headless=new", # 使用新的headless模式
"--disable-gpu",
])
else:
# 非无头模式的额外参数
args.extend([
"--start-maximized", # 最大化窗口,更像真实用户
])
# 用户数据目录
if user_data_dir:
args.append(f"--user-data-dir={user_data_dir}")
utils.logger.info(f"[BrowserLauncher] 启动浏览器: {browser_path}")
utils.logger.info(f"[BrowserLauncher] 调试端口: {debug_port}")
utils.logger.info(f"[BrowserLauncher] 无头模式: {headless}")
try:
# 在Windows上使用CREATE_NEW_PROCESS_GROUP避免Ctrl+C影响子进程
if self.system == "Windows":
process = subprocess.Popen(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
)
else:
process = subprocess.Popen(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
preexec_fn=os.setsid # 创建新的进程组
)
self.browser_process = process
return process
except Exception as e:
utils.logger.error(f"[BrowserLauncher] 启动浏览器失败: {e}")
raise
def wait_for_browser_ready(self, debug_port: int, timeout: int = 30) -> bool:
"""
等待浏览器准备就绪
"""
utils.logger.info(f"[BrowserLauncher] 等待浏览器在端口 {debug_port} 上准备就绪...")
start_time = time.time()
while time.time() - start_time < timeout:
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(1)
result = s.connect_ex(('localhost', debug_port))
if result == 0:
utils.logger.info(f"[BrowserLauncher] 浏览器已在端口 {debug_port} 上准备就绪")
return True
except Exception:
pass
time.sleep(0.5)
utils.logger.error(f"[BrowserLauncher] 浏览器在 {timeout} 秒内未能准备就绪")
return False
def get_browser_info(self, browser_path: str) -> Tuple[str, str]:
"""
获取浏览器信息(名称和版本)
"""
try:
if "chrome" in browser_path.lower():
name = "Google Chrome"
elif "edge" in browser_path.lower() or "msedge" in browser_path.lower():
name = "Microsoft Edge"
elif "chromium" in browser_path.lower():
name = "Chromium"
else:
name = "Unknown Browser"
# 尝试获取版本信息
try:
result = subprocess.run([browser_path, "--version"],
capture_output=True, text=True, timeout=5)
version = result.stdout.strip() if result.stdout else "Unknown Version"
except:
version = "Unknown Version"
return name, version
except Exception:
return "Unknown Browser", "Unknown Version"
def cleanup(self):
"""
清理资源,关闭浏览器进程
"""
if not self.browser_process:
return
process = self.browser_process
if process.poll() is not None:
utils.logger.info("[BrowserLauncher] 浏览器进程已退出,无需清理")
self.browser_process = None
return
utils.logger.info("[BrowserLauncher] 正在关闭浏览器进程...")
try:
if self.system == "Windows":
# 先尝试正常终止
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
utils.logger.warning("[BrowserLauncher] 正常终止超时使用taskkill强制结束")
subprocess.run(
["taskkill", "/F", "/T", "/PID", str(process.pid)],
capture_output=True,
check=False,
)
process.wait(timeout=5)
else:
pgid = os.getpgid(process.pid)
try:
os.killpg(pgid, signal.SIGTERM)
except ProcessLookupError:
utils.logger.info("[BrowserLauncher] 浏览器进程组不存在,可能已退出")
else:
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
utils.logger.warning("[BrowserLauncher] 优雅关闭超时发送SIGKILL")
os.killpg(pgid, signal.SIGKILL)
process.wait(timeout=5)
utils.logger.info("[BrowserLauncher] 浏览器进程已关闭")
except Exception as e:
utils.logger.warning(f"[BrowserLauncher] 关闭浏览器进程时出错: {e}")
finally:
self.browser_process = None