Mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2025-11-25 03:15:17 +08:00)

Compare commits: 7b9db2f748 ... e6f3182ed7 (9 commits)
Commits in this range:

- e6f3182ed7
- 2cf143cc7c
- eb625b0b48
- 84f6f650f8
- 9d6cf065e9
- 95c740dee2
- f97e0c18cd
- 879a72ea30
- 3237073a0e
```diff
@@ -1 +1 @@
-3.9
+3.11
```
README.md

```diff
@@ -1,3 +1,5 @@
+# 🔥 MediaCrawler - 自媒体平台爬虫 🕷️
+
 <div align="center" markdown="1">
 <sup>Special thanks to:</sup>
 <br>
@@ -12,8 +14,6 @@
 </div>
 <hr>
 
-# 🔥 MediaCrawler - 自媒体平台爬虫 🕷️
-
 <div align="center">
 
 <a href="https://trendshift.io/repositories/8291" target="_blank">
```
cmd_arg/arg.py

```diff
@@ -1,60 +1,257 @@
 # Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
 # 1. It must not be used for any commercial purpose.
 # 2. Usage must comply with the target platform's terms of service and robots.txt rules.
 # 3. Do not crawl at large scale or disrupt the platform's operation.
 # 4. Throttle request rates reasonably to avoid placing unnecessary load on the target platform.
 # 5. It must not be used for any illegal or improper purpose.
 #
 # See the LICENSE file in the project root for the full license terms.
 # Using this code constitutes agreement to the above principles and all terms of the LICENSE.
 
-import argparse
+from __future__ import annotations
+
+import sys
+from enum import Enum
+from types import SimpleNamespace
+from typing import Iterable, Optional, Sequence, Type, TypeVar
+
+import typer
+from typing_extensions import Annotated
 
 import config
 from tools.utils import str2bool
 
 
-async def parse_cmd():
-    # Read command-line args
-    parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str,
-                        help='Media platform select (xhs=Xiaohongshu | dy=Douyin | ks=Kuaishou | bili=Bilibili | wb=Weibo | tieba=Baidu Tieba | zhihu=Zhihu)',
-                        choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
-    parser.add_argument('--lt', type=str,
-                        help='Login type (qrcode=QR code | phone=phone number | cookie=cookie)',
-                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
-    parser.add_argument('--type', type=str,
-                        help='Crawler type (search | detail | creator)',
-                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
-    parser.add_argument('--start', type=int,
-                        help='Number of start page', default=config.START_PAGE)
-    parser.add_argument('--keywords', type=str,
-                        help='Please input keywords', default=config.KEYWORDS)
-    parser.add_argument('--get_comment', type=str2bool,
-                        help='''Whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
-    parser.add_argument('--get_sub_comment', type=str2bool,
-                        help=''''Whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
-    parser.add_argument('--save_data_option', type=str,
-                        help='Where to save the data (csv=CSV file | db=MySQL | json=JSON file | sqlite=SQLite)',
-                        choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
-    parser.add_argument('--init_db', type=str,
-                        help='Initialize database schema (sqlite | mysql)',
-                        choices=['sqlite', 'mysql'], default=None)
-    parser.add_argument('--cookies', type=str,
-                        help='Cookies used for cookie login type', default=config.COOKIES)
-
-    args = parser.parse_args()
-
-    # override config
-    config.PLATFORM = args.platform
-    config.LOGIN_TYPE = args.lt
-    config.CRAWLER_TYPE = args.type
-    config.START_PAGE = args.start
-    config.KEYWORDS = args.keywords
-    config.ENABLE_GET_COMMENTS = args.get_comment
-    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
-    config.SAVE_DATA_OPTION = args.save_data_option
-    config.COOKIES = args.cookies
-
-    return args
+EnumT = TypeVar("EnumT", bound=Enum)
+
+
+class PlatformEnum(str, Enum):
+    """Supported media platforms."""
+
+    XHS = "xhs"
+    DOUYIN = "dy"
+    KUAISHOU = "ks"
+    BILIBILI = "bili"
+    WEIBO = "wb"
+    TIEBA = "tieba"
+    ZHIHU = "zhihu"
+
+
+class LoginTypeEnum(str, Enum):
+    """Supported login types."""
+
+    QRCODE = "qrcode"
+    PHONE = "phone"
+    COOKIE = "cookie"
+
+
+class CrawlerTypeEnum(str, Enum):
+    """Supported crawler types."""
+
+    SEARCH = "search"
+    DETAIL = "detail"
+    CREATOR = "creator"
+
+
+class SaveDataOptionEnum(str, Enum):
+    """Supported data storage options."""
+
+    CSV = "csv"
+    DB = "db"
+    JSON = "json"
+    SQLITE = "sqlite"
+
+
+class InitDbOptionEnum(str, Enum):
+    """Database initialization options."""
+
+    SQLITE = "sqlite"
+    MYSQL = "mysql"
+
+
+def _to_bool(value: bool | str) -> bool:
+    if isinstance(value, bool):
+        return value
+    return str2bool(value)
+
+
+def _coerce_enum(
+    enum_cls: Type[EnumT],
+    value: EnumT | str,
+    default: EnumT,
+) -> EnumT:
+    """Safely convert a raw config value to an enum member."""
+
+    if isinstance(value, enum_cls):
+        return value
+
+    try:
+        return enum_cls(value)
+    except ValueError:
+        typer.secho(
+            f"⚠️ Config value '{value}' is not a valid {enum_cls.__name__}; falling back to default '{default.value}'.",
+            fg=typer.colors.YELLOW,
+        )
+        return default
+
+
+def _normalize_argv(argv: Optional[Sequence[str]]) -> Iterable[str]:
+    if argv is None:
+        return list(sys.argv[1:])
+    return list(argv)
+
+
+def _inject_init_db_default(args: Sequence[str]) -> list[str]:
+    """Ensure bare --init_db defaults to sqlite for backward compatibility."""
+
+    normalized: list[str] = []
+    i = 0
+    while i < len(args):
+        arg = args[i]
+        normalized.append(arg)
+
+        if arg == "--init_db":
+            next_arg = args[i + 1] if i + 1 < len(args) else None
+            if not next_arg or next_arg.startswith("-"):
+                normalized.append(InitDbOptionEnum.SQLITE.value)
+        i += 1
+
+    return normalized
+
+
+async def parse_cmd(argv: Optional[Sequence[str]] = None):
+    """Parse command-line arguments with Typer."""
+
+    app = typer.Typer(add_completion=False)
+
+    @app.callback(invoke_without_command=True)
+    def main(
+        platform: Annotated[
+            PlatformEnum,
+            typer.Option(
+                "--platform",
+                help="Media platform (xhs=Xiaohongshu | dy=Douyin | ks=Kuaishou | bili=Bilibili | wb=Weibo | tieba=Baidu Tieba | zhihu=Zhihu)",
+                rich_help_panel="Basic options",
+            ),
+        ] = _coerce_enum(PlatformEnum, config.PLATFORM, PlatformEnum.XHS),
+        lt: Annotated[
+            LoginTypeEnum,
+            typer.Option(
+                "--lt",
+                help="Login type (qrcode=QR code | phone=phone number | cookie=cookie)",
+                rich_help_panel="Account options",
+            ),
+        ] = _coerce_enum(LoginTypeEnum, config.LOGIN_TYPE, LoginTypeEnum.QRCODE),
+        crawler_type: Annotated[
+            CrawlerTypeEnum,
+            typer.Option(
+                "--type",
+                help="Crawler type (search | detail | creator)",
+                rich_help_panel="Basic options",
+            ),
+        ] = _coerce_enum(CrawlerTypeEnum, config.CRAWLER_TYPE, CrawlerTypeEnum.SEARCH),
+        start: Annotated[
+            int,
+            typer.Option(
+                "--start",
+                help="Start page number",
+                rich_help_panel="Basic options",
+            ),
+        ] = config.START_PAGE,
+        keywords: Annotated[
+            str,
+            typer.Option(
+                "--keywords",
+                help="Search keywords, comma-separated",
+                rich_help_panel="Basic options",
+            ),
+        ] = config.KEYWORDS,
+        get_comment: Annotated[
+            str,
+            typer.Option(
+                "--get_comment",
+                help="Whether to crawl first-level comments; accepts yes/true/t/y/1 or no/false/f/n/0",
+                rich_help_panel="Comment options",
+                show_default=True,
+            ),
+        ] = str(config.ENABLE_GET_COMMENTS),
+        get_sub_comment: Annotated[
+            str,
+            typer.Option(
+                "--get_sub_comment",
+                help="Whether to crawl second-level comments; accepts yes/true/t/y/1 or no/false/f/n/0",
+                rich_help_panel="Comment options",
+                show_default=True,
+            ),
+        ] = str(config.ENABLE_GET_SUB_COMMENTS),
+        save_data_option: Annotated[
+            SaveDataOptionEnum,
+            typer.Option(
+                "--save_data_option",
+                help="Data storage option (csv=CSV file | db=MySQL | json=JSON file | sqlite=SQLite)",
+                rich_help_panel="Storage options",
+            ),
+        ] = _coerce_enum(
+            SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON
+        ),
+        init_db: Annotated[
+            Optional[InitDbOptionEnum],
+            typer.Option(
+                "--init_db",
+                help="Initialize database schema (sqlite | mysql)",
+                rich_help_panel="Storage options",
+            ),
+        ] = None,
+        cookies: Annotated[
+            str,
+            typer.Option(
+                "--cookies",
+                help="Cookie value used for the cookie login type",
+                rich_help_panel="Account options",
+            ),
+        ] = config.COOKIES,
+    ) -> SimpleNamespace:
+        """MediaCrawler command-line entry point."""
+
+        enable_comment = _to_bool(get_comment)
+        enable_sub_comment = _to_bool(get_sub_comment)
+        init_db_value = init_db.value if init_db else None
+
+        # override global config
+        config.PLATFORM = platform.value
+        config.LOGIN_TYPE = lt.value
+        config.CRAWLER_TYPE = crawler_type.value
+        config.START_PAGE = start
+        config.KEYWORDS = keywords
+        config.ENABLE_GET_COMMENTS = enable_comment
+        config.ENABLE_GET_SUB_COMMENTS = enable_sub_comment
+        config.SAVE_DATA_OPTION = save_data_option.value
+        config.COOKIES = cookies
+
+        return SimpleNamespace(
+            platform=config.PLATFORM,
+            lt=config.LOGIN_TYPE,
+            type=config.CRAWLER_TYPE,
+            start=config.START_PAGE,
+            keywords=config.KEYWORDS,
+            get_comment=config.ENABLE_GET_COMMENTS,
+            get_sub_comment=config.ENABLE_GET_SUB_COMMENTS,
+            save_data_option=config.SAVE_DATA_OPTION,
+            init_db=init_db_value,
+            cookies=config.COOKIES,
+        )
+
+    command = typer.main.get_command(app)
+
+    cli_args = _normalize_argv(argv)
+    cli_args = _inject_init_db_default(cli_args)
+
+    try:
+        result = command.main(args=cli_args, standalone_mode=False)
+        if isinstance(result, int):  # help/options handled by Typer; propagate exit code
+            raise SystemExit(result)
+        return result
+    except typer.Exit as exc:  # pragma: no cover - CLI exit paths
+        raise SystemExit(exc.exit_code) from exc
```
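The rewritten `parse_cmd` keeps its old name and async signature but now accepts an explicit `argv`, so it can be exercised without a real shell. A minimal smoke test under that assumption (hypothetical snippet, not part of the diff; the flag values come from the enums above):

```python
# Hypothetical smoke test for the Typer-based parser.
import asyncio

from cmd_arg.arg import parse_cmd

args = asyncio.run(
    parse_cmd(
        [
            "--platform", "xhs",
            "--lt", "qrcode",
            "--type", "search",
            "--keywords", "python",
            "--save_data_option", "json",
            "--init_db",  # bare flag: _inject_init_db_default fills in "sqlite"
        ]
    )
)
print(args.platform, args.init_db)  # expected: xhs sqlite
```

Note that `standalone_mode=False` is what lets the returned namespace propagate back to the caller instead of Click exiting the process after the callback runs.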
```diff
@@ -376,7 +376,7 @@ class KuaishouCrawler(AbstractCrawler):
         # Get all video information of the creator
         all_video_list = await self.ks_client.get_all_videos_by_creator(
             user_id=user_id,
-            crawl_interval=random.random(),
+            crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
             callback=self.fetch_creator_video_detail,
         )
```
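This hunk swaps a random sub-second pause (`random.random()` returns a float in [0, 1)) for the configured `config.CRAWLER_MAX_SLEEP_SEC`, making the crawl pacing explicit and tunable. A minimal sketch of the pattern, with a hypothetical `fetch_page` coroutine standing in for the real `ks_client` calls:

```python
# Sketch of fixed-interval pagination; fetch_page is a stand-in, not MediaCrawler API.
import asyncio
from collections.abc import Awaitable, Callable

async def fetch_all_pages(
    fetch_page: Callable[[int], Awaitable[list[dict]]],
    crawl_interval: float,
) -> list[dict]:
    """Fetch pages sequentially, sleeping a fixed interval between requests."""
    results: list[dict] = []
    page = 1
    while True:
        batch = await fetch_page(page)
        if not batch:
            break
        results.extend(batch)
        await asyncio.sleep(crawl_interval)  # explicit pacing instead of random.random()
        page += 1
    return results
```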
pyproject.toml

```diff
@@ -4,7 +4,7 @@ author = "程序员阿江-Relakkes <relakkes@gmail.com>"
 version = "0.1.0"
 description = "A social media crawler project, support Xiaohongshu, Weibo, Zhihu, Bilibili, Douyin, BaiduTieBa etc."
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.11"
 dependencies = [
     "aiofiles~=23.2.1",
     "aiomysql==0.2.0",
@@ -29,6 +29,7 @@ dependencies = [
     "requests==2.32.3",
     "sqlalchemy>=2.0.43",
     "tenacity==8.2.2",
+    "typer>=0.12.3",
     "uvicorn==0.29.0",
     "wordcloud==1.9.3",
 ]
```
requirements.txt

```diff
@@ -2,6 +2,7 @@ httpx==0.28.1
 Pillow==9.5.0
 playwright==1.45.0
 tenacity==8.2.2
+typer>=0.12.3
 opencv-python
 aiomysql==0.2.0
 redis~=4.6.0
```
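The `requires-python` bump is load-bearing: the rewritten `cmd_arg/arg.py` relies on `from __future__ import annotations`, PEP 604 unions (`bool | str`), and built-in generics (`list[str]`), and Typer resolves those annotations at runtime. A fail-fast guard one might add to an entry point (hypothetical, not part of this diff):

```python
import sys

# Fail fast with a clear message instead of an opaque TypeError later on.
if sys.version_info < (3, 11):
    raise SystemExit(
        "MediaCrawler requires Python 3.11+; found "
        f"{sys.version_info.major}.{sys.version_info.minor}"
    )
```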
```diff
@@ -14,6 +14,7 @@ import platform
 import subprocess
 import time
 import socket
+import signal
 from typing import Optional, List, Tuple
 import asyncio
 from pathlib import Path
```
```diff
@@ -106,7 +107,7 @@ class BrowserLauncher:
 
         raise RuntimeError(f"No usable port found; tried {start_port} through {port-1}")
 
-    def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
+    def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
                        user_data_dir: Optional[str] = None) -> subprocess.Popen:
         """
         Launch the browser process
```
```diff
@@ -169,7 +170,8 @@ class BrowserLauncher:
                 stderr=subprocess.DEVNULL,
                 preexec_fn=os.setsid  # create a new process group
             )
+
             self.browser_process = process
             return process
 
         except Exception as e:
```
```diff
@@ -230,20 +232,48 @@
         """
         Clean up resources and shut down the browser process
         """
-        if self.browser_process:
-            try:
-                utils.logger.info("[BrowserLauncher] Closing browser process...")
-
-                if self.system == "Windows":
-                    # On Windows, force-kill the process tree with taskkill
-                    subprocess.run(["taskkill", "/F", "/T", "/PID", str(self.browser_process.pid)],
-                                   capture_output=True)
-                else:
-                    # On Unix, kill the process group
-                    os.killpg(os.getpgid(self.browser_process.pid), 9)
-
-                self.browser_process = None
-                utils.logger.info("[BrowserLauncher] Browser process closed")
-
-            except Exception as e:
-                utils.logger.warning(f"[BrowserLauncher] Error while closing browser process: {e}")
+        if not self.browser_process:
+            return
+
+        process = self.browser_process
+
+        if process.poll() is not None:
+            utils.logger.info("[BrowserLauncher] Browser process already exited; nothing to clean up")
+            self.browser_process = None
+            return
+
+        utils.logger.info("[BrowserLauncher] Closing browser process...")
+
+        try:
+            if self.system == "Windows":
+                # Try graceful termination first
+                process.terminate()
+                try:
+                    process.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    utils.logger.warning("[BrowserLauncher] Graceful termination timed out; force-killing with taskkill")
+                    subprocess.run(
+                        ["taskkill", "/F", "/T", "/PID", str(process.pid)],
+                        capture_output=True,
+                        check=False,
+                    )
+                    process.wait(timeout=5)
+            else:
+                pgid = os.getpgid(process.pid)
+                try:
+                    os.killpg(pgid, signal.SIGTERM)
+                except ProcessLookupError:
+                    utils.logger.info("[BrowserLauncher] Browser process group not found; it may have already exited")
+                else:
+                    try:
+                        process.wait(timeout=5)
+                    except subprocess.TimeoutExpired:
+                        utils.logger.warning("[BrowserLauncher] Graceful shutdown timed out; sending SIGKILL")
+                        os.killpg(pgid, signal.SIGKILL)
+                        process.wait(timeout=5)
+
+            utils.logger.info("[BrowserLauncher] Browser process closed")
+        except Exception as e:
+            utils.logger.warning(f"[BrowserLauncher] Error while closing browser process: {e}")
+        finally:
+            self.browser_process = None
```
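The new cleanup follows the standard escalation ladder: check whether the process already exited, send a polite terminate (SIGTERM to the process group on Unix, `terminate()` on Windows), wait with a timeout, force-kill only if needed, and always clear `self.browser_process` in `finally`. Distilled into a standalone helper (a sketch assuming a POSIX process started with `preexec_fn=os.setsid`, as in `launch_browser` above):

```python
import os
import signal
import subprocess

def stop_process_group(process: subprocess.Popen, timeout: float = 5.0) -> None:
    """SIGTERM the process group; escalate to SIGKILL if it does not exit in time."""
    if process.poll() is not None:
        return  # already exited
    pgid = os.getpgid(process.pid)
    try:
        os.killpg(pgid, signal.SIGTERM)
    except ProcessLookupError:
        return  # group already gone
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        os.killpg(pgid, signal.SIGKILL)
        process.wait(timeout=timeout)
```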
```diff
@@ -291,16 +291,28 @@ class CDPBrowserManager:
         """
         try:
             # Close the browser context
-            # if self.browser_context:
-            #     await self.browser_context.close()
-            #     self.browser_context = None
-            #     utils.logger.info("[CDPBrowserManager] Browser context closed")
+            if self.browser_context:
+                try:
+                    await self.browser_context.close()
+                    utils.logger.info("[CDPBrowserManager] Browser context closed")
+                except Exception as context_error:
+                    utils.logger.warning(
+                        f"[CDPBrowserManager] Failed to close browser context: {context_error}"
+                    )
+                finally:
+                    self.browser_context = None
 
-            # # Disconnect the browser
-            # if self.browser:
-            #     await self.browser.close()
-            #     self.browser = None
-            #     utils.logger.info("[CDPBrowserManager] Browser connection closed")
+            # Disconnect the browser
+            if self.browser:
+                try:
+                    await self.browser.close()
+                    utils.logger.info("[CDPBrowserManager] Browser connection closed")
+                except Exception as browser_error:
+                    utils.logger.warning(
+                        f"[CDPBrowserManager] Failed to close browser connection: {browser_error}"
+                    )
+                finally:
+                    self.browser = None
 
             # Shut down the browser process (if configured to auto-close)
             if config.AUTO_CLOSE_BROWSER:
```
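Both cleanup paths now share the same shape: attempt the close, log a warning instead of raising, and null the reference in `finally`, so a failed context close can no longer skip the browser disconnect (previously both blocks were commented out entirely). The pattern, factored into a reusable helper (hypothetical; MediaCrawler keeps the two blocks inline):

```python
import logging
from collections.abc import Awaitable, Callable
from typing import Optional

logger = logging.getLogger("cdp_cleanup")

async def close_quietly(name: str, closer: Optional[Callable[[], Awaitable[None]]]) -> None:
    """Await an async close callable; warn instead of raising on failure."""
    if closer is None:
        return
    try:
        await closer()
        logger.info("%s closed", name)
    except Exception as err:  # cleanup must not raise
        logger.warning("failed to close %s: %s", name, err)
```

Usage would look like `await close_quietly("browser_context", self.browser_context.close)` followed by setting the attribute to `None`.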