Mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2025-11-25 03:15:17 +08:00)

Compare commits: 7b9db2f748 ... e6f3182ed7 (9 commits)
Commits in this range:

- e6f3182ed7
- 2cf143cc7c
- eb625b0b48
- 84f6f650f8
- 9d6cf065e9
- 95c740dee2
- f97e0c18cd
- 879a72ea30
- 3237073a0e
```diff
@@ -1 +1 @@
-3.9
+3.11
```
README.md

```diff
@@ -1,3 +1,5 @@
+# 🔥 MediaCrawler - 自媒体平台爬虫 🕷️
+
 <div align="center" markdown="1">
 <sup>Special thanks to:</sup>
 <br>
@@ -12,8 +14,6 @@
 </div>
 <hr>
 
-# 🔥 MediaCrawler - 自媒体平台爬虫 🕷️
-
 <div align="center">
 
 <a href="https://trendshift.io/repositories/8291" target="_blank">
```
cmd_arg/arg.py

```diff
@@ -1,60 +1,257 @@
 # Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
 # 1. It must not be used for any commercial purpose.
 # 2. Usage must comply with the target platform's terms of service and robots.txt rules.
 # 3. Do not crawl at large scale or disrupt the platform's operation.
 # 4. Throttle request rates reasonably to avoid placing unnecessary load on the target platform.
 # 5. It must not be used for any illegal or improper purpose.
 #
 # See the LICENSE file in the project root for the full license terms.
 # Using this code constitutes agreement to the above principles and all terms of the LICENSE.
 
-import argparse
+from __future__ import annotations
+
+import sys
+from enum import Enum
+from types import SimpleNamespace
+from typing import Iterable, Optional, Sequence, Type, TypeVar
+
+import typer
+from typing_extensions import Annotated
 
 import config
 from tools.utils import str2bool
 
 
-async def parse_cmd():
-    # Read command-line args
-    parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str,
-                        help='Media platform select (xhs=Xiaohongshu | dy=Douyin | ks=Kuaishou | bili=Bilibili | wb=Weibo | tieba=Baidu Tieba | zhihu=Zhihu)',
-                        choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
-    parser.add_argument('--lt', type=str,
-                        help='Login type (qrcode=QR code | phone=phone number | cookie=cookie)',
-                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
-    parser.add_argument('--type', type=str,
-                        help='Crawler type (search | detail | creator)',
-                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
-    parser.add_argument('--start', type=int,
-                        help='Number of start page', default=config.START_PAGE)
-    parser.add_argument('--keywords', type=str,
-                        help='Please input keywords', default=config.KEYWORDS)
-    parser.add_argument('--get_comment', type=str2bool,
-                        help='''Whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
-    parser.add_argument('--get_sub_comment', type=str2bool,
-                        help=''''Whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
-    parser.add_argument('--save_data_option', type=str,
-                        help='Where to save the data (csv=CSV file | db=MySQL | json=JSON file | sqlite=SQLite)',
-                        choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
-    parser.add_argument('--init_db', type=str,
-                        help='Initialize database schema (sqlite | mysql)',
-                        choices=['sqlite', 'mysql'], default=None)
-    parser.add_argument('--cookies', type=str,
-                        help='Cookies used for cookie login type', default=config.COOKIES)
-
-    args = parser.parse_args()
-
-    # override config
-    config.PLATFORM = args.platform
-    config.LOGIN_TYPE = args.lt
-    config.CRAWLER_TYPE = args.type
-    config.START_PAGE = args.start
-    config.KEYWORDS = args.keywords
-    config.ENABLE_GET_COMMENTS = args.get_comment
-    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
-    config.SAVE_DATA_OPTION = args.save_data_option
-    config.COOKIES = args.cookies
-
-    return args
+EnumT = TypeVar("EnumT", bound=Enum)
+
+
+class PlatformEnum(str, Enum):
+    """Supported media platforms."""
+
+    XHS = "xhs"
+    DOUYIN = "dy"
+    KUAISHOU = "ks"
+    BILIBILI = "bili"
+    WEIBO = "wb"
+    TIEBA = "tieba"
+    ZHIHU = "zhihu"
+
+
+class LoginTypeEnum(str, Enum):
+    """Supported login types."""
+
+    QRCODE = "qrcode"
+    PHONE = "phone"
+    COOKIE = "cookie"
+
+
+class CrawlerTypeEnum(str, Enum):
+    """Supported crawler types."""
+
+    SEARCH = "search"
+    DETAIL = "detail"
+    CREATOR = "creator"
+
+
+class SaveDataOptionEnum(str, Enum):
+    """Supported data storage options."""
+
+    CSV = "csv"
+    DB = "db"
+    JSON = "json"
+    SQLITE = "sqlite"
+
+
+class InitDbOptionEnum(str, Enum):
+    """Database initialization options."""
+
+    SQLITE = "sqlite"
+    MYSQL = "mysql"
+
+
+def _to_bool(value: bool | str) -> bool:
+    if isinstance(value, bool):
+        return value
+    return str2bool(value)
+
+
+def _coerce_enum(
+    enum_cls: Type[EnumT],
+    value: EnumT | str,
+    default: EnumT,
+) -> EnumT:
+    """Safely convert a raw config value to an enum member."""
+
+    if isinstance(value, enum_cls):
+        return value
+
+    try:
+        return enum_cls(value)
+    except ValueError:
+        typer.secho(
+            f"⚠️ Config value '{value}' is not a valid {enum_cls.__name__}; falling back to default '{default.value}'.",
+            fg=typer.colors.YELLOW,
+        )
+        return default
+
+
+def _normalize_argv(argv: Optional[Sequence[str]]) -> Iterable[str]:
+    if argv is None:
+        return list(sys.argv[1:])
+    return list(argv)
+
+
+def _inject_init_db_default(args: Sequence[str]) -> list[str]:
+    """Ensure bare --init_db defaults to sqlite for backward compatibility."""
+
+    normalized: list[str] = []
+    i = 0
+    while i < len(args):
+        arg = args[i]
+        normalized.append(arg)
+
+        if arg == "--init_db":
+            next_arg = args[i + 1] if i + 1 < len(args) else None
+            if not next_arg or next_arg.startswith("-"):
+                normalized.append(InitDbOptionEnum.SQLITE.value)
+        i += 1
+
+    return normalized
+
+
+async def parse_cmd(argv: Optional[Sequence[str]] = None):
+    """Parse command-line arguments with Typer."""
+
+    app = typer.Typer(add_completion=False)
+
+    @app.callback(invoke_without_command=True)
+    def main(
+        platform: Annotated[
+            PlatformEnum,
+            typer.Option(
+                "--platform",
+                help="Media platform (xhs=Xiaohongshu | dy=Douyin | ks=Kuaishou | bili=Bilibili | wb=Weibo | tieba=Baidu Tieba | zhihu=Zhihu)",
+                rich_help_panel="Basic options",
+            ),
+        ] = _coerce_enum(PlatformEnum, config.PLATFORM, PlatformEnum.XHS),
+        lt: Annotated[
+            LoginTypeEnum,
+            typer.Option(
+                "--lt",
+                help="Login type (qrcode=QR code | phone=phone number | cookie=cookie)",
+                rich_help_panel="Account options",
+            ),
+        ] = _coerce_enum(LoginTypeEnum, config.LOGIN_TYPE, LoginTypeEnum.QRCODE),
+        crawler_type: Annotated[
+            CrawlerTypeEnum,
+            typer.Option(
+                "--type",
+                help="Crawler type (search | detail | creator)",
+                rich_help_panel="Basic options",
+            ),
+        ] = _coerce_enum(CrawlerTypeEnum, config.CRAWLER_TYPE, CrawlerTypeEnum.SEARCH),
+        start: Annotated[
+            int,
+            typer.Option(
+                "--start",
+                help="Start page number",
+                rich_help_panel="Basic options",
+            ),
+        ] = config.START_PAGE,
+        keywords: Annotated[
+            str,
+            typer.Option(
+                "--keywords",
+                help="Search keywords, comma-separated",
+                rich_help_panel="Basic options",
+            ),
+        ] = config.KEYWORDS,
+        get_comment: Annotated[
+            str,
+            typer.Option(
+                "--get_comment",
+                help="Whether to crawl first-level comments; accepts yes/true/t/y/1 or no/false/f/n/0",
+                rich_help_panel="Comment options",
+                show_default=True,
+            ),
+        ] = str(config.ENABLE_GET_COMMENTS),
+        get_sub_comment: Annotated[
+            str,
+            typer.Option(
+                "--get_sub_comment",
+                help="Whether to crawl second-level comments; accepts yes/true/t/y/1 or no/false/f/n/0",
+                rich_help_panel="Comment options",
+                show_default=True,
+            ),
+        ] = str(config.ENABLE_GET_SUB_COMMENTS),
+        save_data_option: Annotated[
+            SaveDataOptionEnum,
+            typer.Option(
+                "--save_data_option",
+                help="Data storage option (csv=CSV file | db=MySQL | json=JSON file | sqlite=SQLite)",
+                rich_help_panel="Storage options",
+            ),
+        ] = _coerce_enum(
+            SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON
+        ),
+        init_db: Annotated[
+            Optional[InitDbOptionEnum],
+            typer.Option(
+                "--init_db",
+                help="Initialize database schema (sqlite | mysql)",
+                rich_help_panel="Storage options",
+            ),
+        ] = None,
+        cookies: Annotated[
+            str,
+            typer.Option(
+                "--cookies",
+                help="Cookie value used for the cookie login type",
+                rich_help_panel="Account options",
+            ),
+        ] = config.COOKIES,
+    ) -> SimpleNamespace:
+        """MediaCrawler command-line entry point."""
+
+        enable_comment = _to_bool(get_comment)
+        enable_sub_comment = _to_bool(get_sub_comment)
+        init_db_value = init_db.value if init_db else None
+
+        # override global config
+        config.PLATFORM = platform.value
+        config.LOGIN_TYPE = lt.value
+        config.CRAWLER_TYPE = crawler_type.value
+        config.START_PAGE = start
+        config.KEYWORDS = keywords
+        config.ENABLE_GET_COMMENTS = enable_comment
+        config.ENABLE_GET_SUB_COMMENTS = enable_sub_comment
+        config.SAVE_DATA_OPTION = save_data_option.value
+        config.COOKIES = cookies
+
+        return SimpleNamespace(
+            platform=config.PLATFORM,
+            lt=config.LOGIN_TYPE,
+            type=config.CRAWLER_TYPE,
+            start=config.START_PAGE,
+            keywords=config.KEYWORDS,
+            get_comment=config.ENABLE_GET_COMMENTS,
+            get_sub_comment=config.ENABLE_GET_SUB_COMMENTS,
+            save_data_option=config.SAVE_DATA_OPTION,
+            init_db=init_db_value,
+            cookies=config.COOKIES,
+        )
+
+    command = typer.main.get_command(app)
+
+    cli_args = _normalize_argv(argv)
+    cli_args = _inject_init_db_default(cli_args)
+
+    try:
+        result = command.main(args=cli_args, standalone_mode=False)
+        if isinstance(result, int):  # help/options handled by Typer; propagate exit code
+            raise SystemExit(result)
+        return result
+    except typer.Exit as exc:  # pragma: no cover - CLI exit paths
+        raise SystemExit(exc.exit_code) from exc
```
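The rewritten `parse_cmd` keeps its old name and async signature but now accepts an explicit `argv`, so it can be exercised without a real shell. A minimal smoke test under that assumption (hypothetical snippet, not part of the diff; the flag values come from the enums above):

```python
# Hypothetical smoke test for the Typer-based parser.
import asyncio

from cmd_arg.arg import parse_cmd

args = asyncio.run(
    parse_cmd(
        [
            "--platform", "xhs",
            "--lt", "qrcode",
            "--type", "search",
            "--keywords", "python",
            "--save_data_option", "json",
            "--init_db",  # bare flag: _inject_init_db_default fills in "sqlite"
        ]
    )
)
print(args.platform, args.init_db)  # expected: xhs sqlite
```

Note that `standalone_mode=False` is what lets the returned namespace propagate back to the caller instead of Click exiting the process after the callback runs.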
```diff
@@ -376,7 +376,7 @@ class KuaishouCrawler(AbstractCrawler):
         # Get all video information of the creator
         all_video_list = await self.ks_client.get_all_videos_by_creator(
             user_id=user_id,
-            crawl_interval=random.random(),
+            crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
             callback=self.fetch_creator_video_detail,
         )
```
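This hunk swaps a random sub-second pause (`random.random()` returns a float in [0, 1)) for the configured `config.CRAWLER_MAX_SLEEP_SEC`, making the crawl pacing explicit and tunable. A minimal sketch of the pattern, with a hypothetical `fetch_page` coroutine standing in for the real `ks_client` calls:

```python
# Sketch of fixed-interval pagination; fetch_page is a stand-in, not MediaCrawler API.
import asyncio
from collections.abc import Awaitable, Callable

async def fetch_all_pages(
    fetch_page: Callable[[int], Awaitable[list[dict]]],
    crawl_interval: float,
) -> list[dict]:
    """Fetch pages sequentially, sleeping a fixed interval between requests."""
    results: list[dict] = []
    page = 1
    while True:
        batch = await fetch_page(page)
        if not batch:
            break
        results.extend(batch)
        await asyncio.sleep(crawl_interval)  # explicit pacing instead of random.random()
        page += 1
    return results
```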
pyproject.toml

```diff
@@ -4,7 +4,7 @@ author = "程序员阿江-Relakkes <relakkes@gmail.com>"
 version = "0.1.0"
 description = "A social media crawler project, support Xiaohongshu, Weibo, Zhihu, Bilibili, Douyin, BaiduTieBa etc."
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.11"
 dependencies = [
     "aiofiles~=23.2.1",
     "aiomysql==0.2.0",
@@ -29,6 +29,7 @@ dependencies = [
     "requests==2.32.3",
     "sqlalchemy>=2.0.43",
     "tenacity==8.2.2",
+    "typer>=0.12.3",
     "uvicorn==0.29.0",
     "wordcloud==1.9.3",
 ]
```
requirements.txt

```diff
@@ -2,6 +2,7 @@ httpx==0.28.1
 Pillow==9.5.0
 playwright==1.45.0
 tenacity==8.2.2
+typer>=0.12.3
 opencv-python
 aiomysql==0.2.0
 redis~=4.6.0
```
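The `requires-python` bump is load-bearing: the rewritten `cmd_arg/arg.py` relies on `from __future__ import annotations`, PEP 604 unions (`bool | str`), and built-in generics (`list[str]`), and Typer resolves those annotations at runtime. A fail-fast guard one might add to an entry point (hypothetical, not part of this diff):

```python
import sys

# Fail fast with a clear message instead of an opaque TypeError later on.
if sys.version_info < (3, 11):
    raise SystemExit(
        "MediaCrawler requires Python 3.11+; found "
        f"{sys.version_info.major}.{sys.version_info.minor}"
    )
```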
```diff
@@ -14,6 +14,7 @@ import platform
 import subprocess
 import time
 import socket
+import signal
 from typing import Optional, List, Tuple
 import asyncio
 from pathlib import Path
```
```diff
@@ -106,7 +107,7 @@ class BrowserLauncher:
 
         raise RuntimeError(f"No usable port found; tried {start_port} through {port-1}")
 
-    def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
+    def launch_browser(self, browser_path: str, debug_port: int, headless: bool = False,
                        user_data_dir: Optional[str] = None) -> subprocess.Popen:
         """
         Launch the browser process
```
```diff
@@ -169,7 +170,8 @@ class BrowserLauncher:
                 stderr=subprocess.DEVNULL,
                 preexec_fn=os.setsid  # create a new process group
             )
+
             self.browser_process = process
             return process
 
         except Exception as e:
```
```diff
@@ -230,20 +232,48 @@
         """
         Clean up resources and shut down the browser process
         """
-        if self.browser_process:
-            try:
-                utils.logger.info("[BrowserLauncher] Closing browser process...")
-
-                if self.system == "Windows":
-                    # On Windows, force-kill the process tree with taskkill
-                    subprocess.run(["taskkill", "/F", "/T", "/PID", str(self.browser_process.pid)],
-                                   capture_output=True)
-                else:
-                    # On Unix, kill the process group
-                    os.killpg(os.getpgid(self.browser_process.pid), 9)
-
-                self.browser_process = None
-                utils.logger.info("[BrowserLauncher] Browser process closed")
-
-            except Exception as e:
-                utils.logger.warning(f"[BrowserLauncher] Error while closing browser process: {e}")
+        if not self.browser_process:
+            return
+
+        process = self.browser_process
+
+        if process.poll() is not None:
+            utils.logger.info("[BrowserLauncher] Browser process already exited; nothing to clean up")
+            self.browser_process = None
+            return
+
+        utils.logger.info("[BrowserLauncher] Closing browser process...")
+
+        try:
+            if self.system == "Windows":
+                # Try graceful termination first
+                process.terminate()
+                try:
+                    process.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    utils.logger.warning("[BrowserLauncher] Graceful termination timed out; force-killing with taskkill")
+                    subprocess.run(
+                        ["taskkill", "/F", "/T", "/PID", str(process.pid)],
+                        capture_output=True,
+                        check=False,
+                    )
+                    process.wait(timeout=5)
+            else:
+                pgid = os.getpgid(process.pid)
+                try:
+                    os.killpg(pgid, signal.SIGTERM)
+                except ProcessLookupError:
+                    utils.logger.info("[BrowserLauncher] Browser process group not found; it may have already exited")
+                else:
+                    try:
+                        process.wait(timeout=5)
+                    except subprocess.TimeoutExpired:
+                        utils.logger.warning("[BrowserLauncher] Graceful shutdown timed out; sending SIGKILL")
+                        os.killpg(pgid, signal.SIGKILL)
+                        process.wait(timeout=5)
+
+            utils.logger.info("[BrowserLauncher] Browser process closed")
+        except Exception as e:
+            utils.logger.warning(f"[BrowserLauncher] Error while closing browser process: {e}")
+        finally:
+            self.browser_process = None
```
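The new cleanup follows the standard escalation ladder: check whether the process already exited, send a polite terminate (SIGTERM to the process group on Unix, `terminate()` on Windows), wait with a timeout, force-kill only if needed, and always clear `self.browser_process` in `finally`. Distilled into a standalone helper (a sketch assuming a POSIX process started with `preexec_fn=os.setsid`, as in `launch_browser` above):

```python
import os
import signal
import subprocess

def stop_process_group(process: subprocess.Popen, timeout: float = 5.0) -> None:
    """SIGTERM the process group; escalate to SIGKILL if it does not exit in time."""
    if process.poll() is not None:
        return  # already exited
    pgid = os.getpgid(process.pid)
    try:
        os.killpg(pgid, signal.SIGTERM)
    except ProcessLookupError:
        return  # group already gone
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        os.killpg(pgid, signal.SIGKILL)
        process.wait(timeout=timeout)
```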
```diff
@@ -291,16 +291,28 @@ class CDPBrowserManager:
         """
         try:
             # Close the browser context
-            # if self.browser_context:
-            #     await self.browser_context.close()
-            #     self.browser_context = None
-            #     utils.logger.info("[CDPBrowserManager] Browser context closed")
+            if self.browser_context:
+                try:
+                    await self.browser_context.close()
+                    utils.logger.info("[CDPBrowserManager] Browser context closed")
+                except Exception as context_error:
+                    utils.logger.warning(
+                        f"[CDPBrowserManager] Failed to close browser context: {context_error}"
+                    )
+                finally:
+                    self.browser_context = None
 
-            # # Disconnect the browser
-            # if self.browser:
-            #     await self.browser.close()
-            #     self.browser = None
-            #     utils.logger.info("[CDPBrowserManager] Browser connection closed")
+            # Disconnect the browser
+            if self.browser:
+                try:
+                    await self.browser.close()
+                    utils.logger.info("[CDPBrowserManager] Browser connection closed")
+                except Exception as browser_error:
+                    utils.logger.warning(
+                        f"[CDPBrowserManager] Failed to close browser connection: {browser_error}"
+                    )
+                finally:
+                    self.browser = None
 
             # Shut down the browser process (if configured to auto-close)
             if config.AUTO_CLOSE_BROWSER:
```
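Both cleanup paths now share the same shape: attempt the close, log a warning instead of raising, and null the reference in `finally`, so a failed context close can no longer skip the browser disconnect (previously both blocks were commented out entirely). The pattern, factored into a reusable helper (hypothetical; MediaCrawler keeps the two blocks inline):

```python
import logging
from collections.abc import Awaitable, Callable
from typing import Optional

logger = logging.getLogger("cdp_cleanup")

async def close_quietly(name: str, closer: Optional[Callable[[], Awaitable[None]]]) -> None:
    """Await an async close callable; warn instead of raising on failure."""
    if closer is None:
        return
    try:
        await closer()
        logger.info("%s closed", name)
    except Exception as err:  # cleanup must not raise
        logger.warning("failed to close %s: %s", name, err)
```

Usage would look like `await close_quietly("browser_context", self.browser_context.close)` followed by setting the attribute to `None`.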