22 Commits

Author SHA1 Message Date
程序员阿江(Relakkes)
5288bddb42 refactor: weibo search #771 2025-11-17 17:24:47 +08:00
程序员阿江(Relakkes)
6dcfd7e0a5 refactor: weibo login 2025-11-17 17:11:35 +08:00
程序员阿江(Relakkes)
e89a6d5781 feat: cdp browser cleanup after crawler done 2025-11-17 12:21:53 +08:00
程序员阿江(Relakkes)
a1c5e07df8 fix: xhs sub comment bugfix #769 2025-11-17 11:47:33 +08:00
程序员阿江(Relakkes)
b6caa7a85e refactor: add xhs creator params 2025-11-10 21:10:03 +08:00
程序员阿江(Relakkes)
1e3637f238 refactor: update xhs note detail 2025-11-10 18:13:51 +08:00
程序员阿江(Relakkes)
b5dab6d1e8 refactor: replace the Playwright-based signing scheme with xhshow
Thanks to the @Cloxl/xhshow open-source project

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-10 18:12:45 +08:00
程序员阿江-Relakkes
54f23b8d1c Merge pull request #768 from yangtao210/main
Improve the MongoDB config retrieval logic and move the storage base class; integration tests
2025-11-07 05:44:07 -05:00
yangtao210
58eb89f073 Merge branch 'NanmiCoder:main' into main 2025-11-07 17:44:09 +08:00
yt210
7888f4c6bd Improve the MongoDB config retrieval logic and move the storage base class; integration tests 2025-11-07 17:42:50 +08:00
yt210
b61ec54a72 Improve the MongoDB config retrieval logic and move the storage base class. 2025-11-07 17:42:28 +08:00
程序员阿江(Relakkes)
60cbb3e37d fix: weibo container error #568 2025-11-06 19:43:09 +08:00
程序员阿江-Relakkes
05a1782746 Merge pull request #764 from yangtao210/main
Add MongoDB storage support
2025-11-06 06:10:49 -05:00
yt210
ef6948b305 Add MongoDB storage support 2025-11-06 10:40:30 +08:00
程序员阿江(Relakkes)
45ec4b433a docs: update 2025-11-06 00:08:03 +08:00
程序员阿江(Relakkes)
0074e975dd fix: dy search 2025-11-04 00:14:16 +08:00
程序员阿江(Relakkes)
889fa01466 fix: bilibili word-cloud fix 2025-11-02 13:25:31 +08:00
程序员阿江(Relakkes)
3f5925e326 feat: update xhs sign 2025-10-27 19:06:07 +08:00
程序员阿江(Relakkes)
ed6e0bfb5f refactor: switch tieba to browser-based data fetching 2025-10-19 17:09:55 +08:00
程序员阿江(Relakkes)
26a261bc09 Merge branch 'feature/config-refactor-20251018' 2025-10-19 15:32:42 +08:00
程序员阿江(Relakkes)
03e384bbe2 refactor: remove stealth injection in CDP mode 2025-10-19 15:32:03 +08:00
程序员阿江-Relakkes
56bf5d226f The configuration file supports URL crawling
Feature/config refactor 20251018
2025-10-18 07:42:14 +08:00
48 changed files with 2040 additions and 300 deletions

.DS_Store (new binary file, not shown)

@@ -314,6 +314,7 @@ Nstbrowser 指纹浏览器 — 多账号运营&自动化管理的最佳解决方
 ## 📚 References
+- **Xiaohongshu signature repo**: [Cloxl's xhs signature repo](https://github.com/Cloxl/xhshow)
 - **Xiaohongshu client**: [ReaJason's xhs repo](https://github.com/ReaJason/xhs)
 - **SMS forwarding**: [SmsForwarder reference repo](https://github.com/pppscn/SmsForwarder)
 - **Intranet tunneling tool**: [ngrok official docs](https://ngrok.com/docs/)

@@ -55,7 +55,7 @@ CUSTOM_BROWSER_PATH = ""
 CDP_HEADLESS = False
 # Browser launch timeout (seconds)
-BROWSER_LAUNCH_TIMEOUT = 30
+BROWSER_LAUNCH_TIMEOUT = 60
 # Whether to close the browser automatically when the program finishes
 # Set to False to keep the browser running for easier debugging

@@ -42,4 +42,19 @@ SQLITE_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "databa
 sqlite_db_config = {
     "db_path": SQLITE_DB_PATH
 }
+
+# mongodb config
+MONGODB_HOST = os.getenv("MONGODB_HOST", "localhost")
+MONGODB_PORT = os.getenv("MONGODB_PORT", 27017)
+MONGODB_USER = os.getenv("MONGODB_USER", "")
+MONGODB_PWD = os.getenv("MONGODB_PWD", "")
+MONGODB_DB_NAME = os.getenv("MONGODB_DB_NAME", "media_crawler")
+
+mongodb_config = {
+    "host": MONGODB_HOST,
+    "port": int(MONGODB_PORT),
+    "user": MONGODB_USER,
+    "password": MONGODB_PWD,
+    "db_name": MONGODB_DB_NAME,
+}
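As a quick illustration of how the new mongodb_config block might be consumed, the sketch below builds a Motor client from it; the URL handling mirrors the hunk above, while the standalone main() wrapper is purely illustrative and not part of the repository.

    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient

    from config import db_config  # assumed import path

    async def main():
        cfg = db_config.mongodb_config
        # Build the connection URL with or without authentication, as in the config change above
        if cfg["user"] and cfg["password"]:
            url = f"mongodb://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/"
        else:
            url = f"mongodb://{cfg['host']}:{cfg['port']}/"
        client = AsyncIOMotorClient(url, serverSelectionTimeoutMS=5000)
        await client.server_info()  # raises if the server is unreachable
        print(client[cfg["db_name"]].name)

    asyncio.run(main())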

@@ -12,7 +12,7 @@
 # Weibo platform config
 # The concrete enum values for the search type are in media_platform/weibo/field.py
-WEIBO_SEARCH_TYPE = "popular"
+WEIBO_SEARCH_TYPE = "default"
 # List of specified Weibo IDs
 WEIBO_SPECIFIED_ID_LIST = [

@@ -17,16 +17,13 @@ SORT_TYPE = "popularity_descending"
 # List of specified note URLs; the xsec_token parameter is required
 XHS_SPECIFIED_NOTE_URL_LIST = [
-    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+    "https://www.xiaohongshu.com/explore/64b95d01000000000c034587?xsec_token=AB0EFqJvINCkj6xOCKCQgfNNh8GdnBC_6XecG4QOddo3Q=&xsec_source=pc_cfeed"
     # ........................
 ]
-# List of specified creator URLs (full URL or bare ID supported)
-# Supported formats:
-# 1. Full creator profile URL (with xsec_token and xsec_source parameters): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
-# 2. Bare user_id: "63e36c9a000000002703502b"
+# List of specified creator URLs; the xsec_token and xsec_source parameters are required
 XHS_CREATOR_ID_LIST = [
-    "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
-    "63e36c9a000000002703502b",
+    "https://www.xiaohongshu.com/user/profile/5f58bd990000000001003753?xsec_token=ABYVg1evluJZZzpMX-VWzchxQ1qSNVW3r-jOEnKqMcgZw=&xsec_source=pc_search"
     # ........................
 ]
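Since both lists now require URLs carrying xsec_token and xsec_source, here is a small standard-library sketch of pulling those parameters back out of a configured note URL (the URL is the example from the hunk above):

    from urllib.parse import urlparse, parse_qs

    url = ("https://www.xiaohongshu.com/explore/64b95d01000000000c034587"
           "?xsec_token=AB0EFqJvINCkj6xOCKCQgfNNh8GdnBC_6XecG4QOddo3Q=&xsec_source=pc_cfeed")
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    note_id = parsed.path.rsplit("/", 1)[-1]          # "64b95d01000000000c034587"
    print(note_id, query["xsec_token"][0], query["xsec_source"][0])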

@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""MongoDB storage base: provides connection management and common persistence helpers"""
+import asyncio
+from typing import Dict, List, Optional
+
+from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection
+
+from config import db_config
+from tools import utils
+
+
+class MongoDBConnection:
+    """MongoDB connection manager (singleton)"""
+
+    _instance = None
+    _client: Optional[AsyncIOMotorClient] = None
+    _db: Optional[AsyncIOMotorDatabase] = None
+    _lock = asyncio.Lock()
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(MongoDBConnection, cls).__new__(cls)
+        return cls._instance
+
+    async def get_client(self) -> AsyncIOMotorClient:
+        """Get the client"""
+        if self._client is None:
+            async with self._lock:
+                if self._client is None:
+                    await self._connect()
+        return self._client
+
+    async def get_db(self) -> AsyncIOMotorDatabase:
+        """Get the database"""
+        if self._db is None:
+            async with self._lock:
+                if self._db is None:
+                    await self._connect()
+        return self._db
+
+    async def _connect(self):
+        """Establish the connection"""
+        try:
+            mongo_config = db_config.mongodb_config
+            host = mongo_config["host"]
+            port = mongo_config["port"]
+            user = mongo_config["user"]
+            password = mongo_config["password"]
+            db_name = mongo_config["db_name"]
+
+            # Build the connection URL (with or without authentication)
+            if user and password:
+                connection_url = f"mongodb://{user}:{password}@{host}:{port}/"
+            else:
+                connection_url = f"mongodb://{host}:{port}/"
+
+            self._client = AsyncIOMotorClient(connection_url, serverSelectionTimeoutMS=5000)
+            await self._client.server_info()  # test the connection
+            self._db = self._client[db_name]
+            utils.logger.info(f"[MongoDBConnection] Connected to {host}:{port}/{db_name}")
+        except Exception as e:
+            utils.logger.error(f"[MongoDBConnection] Connection failed: {e}")
+            raise
+
+    async def close(self):
+        """Close the connection"""
+        if self._client is not None:
+            self._client.close()
+            self._client = None
+            self._db = None
+            utils.logger.info("[MongoDBConnection] Connection closed")
+
+
+class MongoDBStoreBase:
+    """MongoDB storage base class: provides common CRUD operations"""
+
+    def __init__(self, collection_prefix: str):
+        """Initialize the storage base class
+        Args:
+            collection_prefix: platform prefix (xhs/douyin/bilibili, etc.)
+        """
+        self.collection_prefix = collection_prefix
+        self._connection = MongoDBConnection()
+
+    async def get_collection(self, collection_suffix: str) -> AsyncIOMotorCollection:
+        """Get the collection: {prefix}_{suffix}"""
+        db = await self._connection.get_db()
+        collection_name = f"{self.collection_prefix}_{collection_suffix}"
+        return db[collection_name]
+
+    async def save_or_update(self, collection_suffix: str, query: Dict, data: Dict) -> bool:
+        """Save or update data (upsert)"""
+        try:
+            collection = await self.get_collection(collection_suffix)
+            await collection.update_one(query, {"$set": data}, upsert=True)
+            return True
+        except Exception as e:
+            utils.logger.error(f"[MongoDBStoreBase] Save failed ({self.collection_prefix}_{collection_suffix}): {e}")
+            return False
+
+    async def find_one(self, collection_suffix: str, query: Dict) -> Optional[Dict]:
+        """Query a single document"""
+        try:
+            collection = await self.get_collection(collection_suffix)
+            return await collection.find_one(query)
+        except Exception as e:
+            utils.logger.error(f"[MongoDBStoreBase] Find one failed ({self.collection_prefix}_{collection_suffix}): {e}")
+            return None
+
+    async def find_many(self, collection_suffix: str, query: Dict, limit: int = 0) -> List[Dict]:
+        """Query multiple documents (limit=0 means no limit)"""
+        try:
+            collection = await self.get_collection(collection_suffix)
+            cursor = collection.find(query)
+            if limit > 0:
+                cursor = cursor.limit(limit)
+            return await cursor.to_list(length=None)
+        except Exception as e:
+            utils.logger.error(f"[MongoDBStoreBase] Find many failed ({self.collection_prefix}_{collection_suffix}): {e}")
+            return []
+
+    async def create_index(self, collection_suffix: str, keys: List[tuple], unique: bool = False):
+        """Create an index, keys=[("field", 1)]"""
+        try:
+            collection = await self.get_collection(collection_suffix)
+            await collection.create_index(keys, unique=unique)
+            utils.logger.info(f"[MongoDBStoreBase] Index created on {self.collection_prefix}_{collection_suffix}")
+        except Exception as e:
+            utils.logger.error(f"[MongoDBStoreBase] Create index failed: {e}")

@@ -59,7 +59,6 @@ export default defineConfig({
     text: 'MediaCrawler源码剖析课',
     link: 'https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh'
 },
-{text: '知识星球文章专栏', link: '/知识星球介绍'},
 {text: '开发者咨询服务', link: '/开发者咨询'},
 ]
 },

@@ -1,12 +1,12 @@
 # About the Author
-> Everyone calls me Ajiang, online handle: 程序员阿江-Relakkes. I have quit my job and am exploring freelancing, hoping to achieve my ideal lifestyle through my own technical skills and hard work.
->
-> I have a large network of technical contacts; if you have crawler consulting needs or programming gigs, feel free to send them my way.
+> Everyone calls me Ajiang, online handle: 程序员阿江-Relakkes. I am now an independent developer focused on AI Agent and crawler development. All in AI.
 - [Author of MediaCrawler, a 10k+ star open-source self-media crawler repo on GitHub](https://github.com/NanmiCoder/MediaCrawler)
 - Full-stack programmer, familiar with Python, Golang, and JavaScript; mainly use Golang at work.
 - Led and participated in the architecture design and coding of a million-scale crawling system.
 - Crawling is a technical hobby; it feels like a contest, and the harder it gets, the more exciting it is.
+- Currently focused on the AI Agent space, actively exploring applications and innovation of AI technology.
+- If you have an AI Agent project that needs collaboration, feel free to contact me; I have plenty of time to invest.
 ## WeChat Contact
 ![relakkes_weichat.JPG](static/images/relakkes_weichat.jpg)

@@ -15,5 +15,3 @@
 ## MediaCrawler Source-Code Walkthrough Video Course
 [MediaCrawler source-code course introduction](https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh)
-## 知识星球 column: crawler reverse engineering & programming
-[知识星球 column introduction](知识星球介绍.md)

@@ -1,31 +0,0 @@
-# 知识星球 Column
-## Overview
-Articles:
-- 1. Crawler JS reverse-engineering case studies
-- 2. MediaCrawler implementation write-ups.
-- 3. Accumulated Python development experience and tips
-- ......................
-Questions:
-- 4. Ask me anything inside the planet about MediaCrawler, crawlers, or programming
-## Chapters
-- [逆向案例 - 某16x8平台商品列表接口逆向参数分析](https://articles.zsxq.com/id_x1qmtg8pzld9.html)
-- [逆向案例 - Product Hunt月度最佳产品榜单接口加密参数分析](https://articles.zsxq.com/id_au4eich3x2sg.html)
-- [逆向案例 - 某zhi乎x-zse-96参数分析过程](https://articles.zsxq.com/id_dui2vil0ag1l.html)
-- [逆向案例 - 某x识星球X-Signature加密参数分析过程](https://articles.zsxq.com/id_pp4madwcwcg8.html)
-- [【独创】使用Playwright获取某音a_bogus参数流程包含加密参数分析](https://articles.zsxq.com/id_u89al50jk9x0.html)
-- [【独创】使用Playwright低成本获取某书X-s参数流程分析当年的回忆录](https://articles.zsxq.com/id_u4lcrvqakuc7.html)
-- [MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
-- [手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
-- [一次Mysql数据库中混用collation排序规则带来的bug](https://articles.zsxq.com/id_pibwr1wnst2p.html)
-- [错误使用 Python 可变类型带来的隐藏 Bug](https://articles.zsxq.com/id_f7vn89l1d303.html)
-- [【MediaCrawler】微博帖子评论爬虫教程](https://articles.zsxq.com/id_vrmuhw0ovj3t.html)
-- [Python协程在并发场景下的幂等性问题](https://articles.zsxq.com/id_wocdwsfmfcmp.html)
-- ........................................
-## Join the Planet
-![星球qrcode.JPG](static/images/星球qrcode.jpg)

main.py (71 changed lines)

@@ -11,6 +11,7 @@
 import asyncio
 import sys
+import signal
 from typing import Optional

 import cmd_arg
@@ -24,6 +25,8 @@ from media_platform.tieba import TieBaCrawler
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 from media_platform.zhihu import ZhihuCrawler
+from tools.async_file_writer import AsyncFileWriter
+from var import crawler_type_var

 class CrawlerFactory:
@@ -72,17 +75,75 @@ async def main():
     crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
     await crawler.start()
+
+    # Generate the wordcloud after crawling is complete
+    # Only for the JSON save mode
+    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
+        try:
+            file_writer = AsyncFileWriter(
+                platform=config.PLATFORM,
+                crawler_type=crawler_type_var.get()
+            )
+            await file_writer.generate_wordcloud_from_comments()
+        except Exception as e:
+            print(f"Error generating wordcloud: {e}")
+
+
+async def async_cleanup():
+    """Async cleanup helper for the CDP browser and other async resources"""
+    global crawler
+    if crawler:
+        # Check for and clean up the CDP browser
+        if hasattr(crawler, 'cdp_manager') and crawler.cdp_manager:
+            try:
+                await crawler.cdp_manager.cleanup(force=True)  # force-clean the browser process
+            except Exception as e:
+                # Only log unexpected errors
+                error_msg = str(e).lower()
+                if "closed" not in error_msg and "disconnected" not in error_msg:
+                    print(f"[Main] Error while cleaning up the CDP browser: {e}")
+        # Check for and close the standard browser context (non-CDP mode only)
+        elif hasattr(crawler, 'browser_context') and crawler.browser_context:
+            try:
+                # Check whether the context is still open
+                if hasattr(crawler.browser_context, 'pages'):
+                    await crawler.browser_context.close()
+            except Exception as e:
+                # Only log unexpected errors
+                error_msg = str(e).lower()
+                if "closed" not in error_msg and "disconnected" not in error_msg:
+                    print(f"[Main] Error while closing the browser context: {e}")
+
+    # Close database connections
+    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
+        await db.close()
+
+
 def cleanup():
-    if crawler:
-        # asyncio.run(crawler.close())
-        pass
-    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
-        asyncio.run(db.close())
+    """Synchronous cleanup entry point"""
+    try:
+        # Create a fresh event loop to run the async cleanup
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(async_cleanup())
+        loop.close()
+    except Exception as e:
+        print(f"[Main] Error during cleanup: {e}")
+
+
+def signal_handler(signum, _frame):
+    """Signal handler for Ctrl+C and other interrupt signals"""
+    print(f"\n[Main] Received interrupt signal {signum}, cleaning up resources...")
+    cleanup()
+    sys.exit(0)

 if __name__ == "__main__":
+    # Register signal handlers
+    signal.signal(signal.SIGINT, signal_handler)   # Ctrl+C
+    signal.signal(signal.SIGTERM, signal_handler)  # termination signal
+
     try:
         asyncio.get_event_loop().run_until_complete(main())
+    except KeyboardInterrupt:
+        print("\n[Main] Keyboard interrupt received, cleaning up resources...")
     finally:
         cleanup()
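The pattern above (a synchronous signal handler driving async cleanup on a fresh event loop) can be reproduced in isolation; this is a stripped-down sketch, not the project's actual cleanup logic:

    import asyncio
    import signal
    import sys

    async def async_cleanup():
        # Stand-in for closing browsers / database connections
        print("releasing async resources...")

    def cleanup():
        # A new loop is needed because the handler runs in synchronous context
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(async_cleanup())
        loop.close()

    def signal_handler(signum, _frame):
        print(f"received signal {signum}, cleaning up...")
        cleanup()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)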

@@ -78,8 +78,9 @@ class BilibiliCrawler(AbstractCrawler):
             # Launch a browser context.
             chromium = playwright.chromium
             self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
             # stealth.min.js is a js script to prevent the website from detecting the crawler.
             await self.browser_context.add_init_script(path="libs/stealth.min.js")
             self.context_page = await self.browser_context.new_page()
             await self.context_page.goto(self.index_url)
@@ -496,11 +497,12 @@
                     "height": 1080
                 },
                 user_agent=user_agent,
+                channel="chrome",  # use the system's stable Chrome
             )
             return browser_context
         else:
             # type: ignore
-            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome")
             browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
             return browser_context
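The channel="chrome" argument added across the crawlers asks Playwright to drive the locally installed stable Google Chrome instead of its bundled Chromium. A minimal sketch of that launch path, with placeholder URL and viewport:

    import asyncio
    from playwright.async_api import async_playwright

    async def main():
        async with async_playwright() as p:
            # Requires a system-wide Google Chrome installation; launch fails otherwise.
            browser = await p.chromium.launch(headless=False, channel="chrome")
            context = await browser.new_context(viewport={"width": 1920, "height": 1080})
            page = await context.new_page()
            await page.goto("https://example.com")
            await browser.close()

    asyncio.run(main())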

@@ -91,8 +91,10 @@ class DouYinClient(AbstractApiClient):
             post_data = {}
             if request_method == "POST":
                 post_data = params
-            a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
-            params["a_bogus"] = a_bogus
+            if "/v1/web/general/search" not in uri:
+                a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
+                params["a_bogus"] = a_bogus

     async def request(self, method, url, **kwargs):
         async with httpx.AsyncClient(proxy=self.proxy) as client:

@@ -74,8 +74,9 @@ class DouYinCrawler(AbstractCrawler):
                 user_agent=None,
                 headless=config.HEADLESS,
             )
             # stealth.min.js is a js script to prevent the website from detecting the crawler.
             await self.browser_context.add_init_script(path="libs/stealth.min.js")
             self.context_page = await self.browser_context.new_page()
             await self.context_page.goto(self.index_url)

@@ -78,8 +78,10 @@ class KuaishouCrawler(AbstractCrawler):
             self.browser_context = await self.launch_browser(
                 chromium, None, self.user_agent, headless=config.HEADLESS
             )
             # stealth.min.js is a js script to prevent the website from detecting the crawler.
             await self.browser_context.add_init_script(path="libs/stealth.min.js")
             self.context_page = await self.browser_context.new_page()
             await self.context_page.goto(f"{self.index_url}?isHome=1")
@@ -331,10 +333,11 @@ class KuaishouCrawler(AbstractCrawler):
                 proxy=playwright_proxy,  # type: ignore
                 viewport={"width": 1920, "height": 1080},
                 user_agent=user_agent,
+                channel="chrome",  # use the system's stable Chrome
             )
             return browser_context
         else:
-            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome")  # type: ignore
             browser_context = await browser.new_context(
                 viewport={"width": 1920, "height": 1080}, user_agent=user_agent
             )

@@ -11,10 +11,10 @@
 import asyncio
 import json
 from typing import Any, Callable, Dict, List, Optional, Union
-from urllib.parse import urlencode
+from urllib.parse import urlencode, quote

-import httpx
-from playwright.async_api import BrowserContext
+import requests
+from playwright.async_api import BrowserContext, Page
 from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

 import config
@@ -34,34 +34,76 @@ class BaiduTieBaClient(AbstractApiClient):
         timeout=10,
         ip_pool=None,
         default_ip_proxy=None,
+        headers: Dict[str, str] = None,
+        playwright_page: Optional[Page] = None,
     ):
         self.ip_pool: Optional[ProxyIpPool] = ip_pool
         self.timeout = timeout
-        self.headers = {
+        # Use the headers passed in (containing the real browser UA) or the defaults
+        self.headers = headers or {
             "User-Agent": utils.get_user_agent(),
-            "Cookies": "",
+            "Cookie": "",
         }
         self._host = "https://tieba.baidu.com"
         self._page_extractor = TieBaExtractor()
         self.default_ip_proxy = default_ip_proxy
+        self.playwright_page = playwright_page  # Playwright page object
+
+    def _sync_request(self, method, url, proxy=None, **kwargs):
+        """
+        Synchronous request method based on requests
+        Args:
+            method: request method
+            url: request URL
+            proxy: proxy IP
+            **kwargs: other request parameters
+        Returns:
+            response object
+        """
+        # Build the proxies dict
+        proxies = None
+        if proxy:
+            proxies = {
+                "http": proxy,
+                "https": proxy,
+            }
+        # Send the request
+        response = requests.request(
+            method=method,
+            url=url,
+            headers=self.headers,
+            proxies=proxies,
+            timeout=self.timeout,
+            **kwargs
+        )
+        return response

     @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
     async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
         """
-        Shared request helper wrapping httpx, with some post-processing of the response
+        Shared request helper wrapping requests, with some post-processing of the response
         Args:
             method: request method
             url: request URL
             return_ori_content: whether to return the raw content
-            proxies: proxy IP
+            proxy: proxy IP
             **kwargs: other request parameters such as headers and body
         Returns:
         """
         actual_proxy = proxy if proxy else self.default_ip_proxy
-        async with httpx.AsyncClient(proxy=actual_proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, headers=self.headers, **kwargs)
+
+        # Run the synchronous requests call in a worker thread
+        response = await asyncio.to_thread(
+            self._sync_request,
+            method,
+            url,
+            actual_proxy,
+            **kwargs
+        )

         if response.status_code != 200:
             utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
@@ -69,7 +111,7 @@ class BaiduTieBaClient(AbstractApiClient):
             raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")

         if response.text == "" or response.text == "blocked":
-            utils.logger.error(f"request params incrr, response.text: {response.text}")
+            utils.logger.error(f"request params incorrect, response.text: {response.text}")
             raise Exception("account blocked")

         if return_ori_content:
@@ -119,26 +161,41 @@ class BaiduTieBaClient(AbstractApiClient):
         json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
         return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, **kwargs)

-    async def pong(self) -> bool:
+    async def pong(self, browser_context: BrowserContext = None) -> bool:
         """
         Check whether the login state has expired
-        Returns:
+        Uses cookie inspection instead of an API call to avoid detection
+        Args:
+            browser_context: browser context object
+        Returns:
+            bool: True means logged in, False means not logged in
         """
-        utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
+        utils.logger.info("[BaiduTieBaClient.pong] Begin to check tieba login state by cookies...")
+        if not browser_context:
+            utils.logger.warning("[BaiduTieBaClient.pong] browser_context is None, assume not logged in")
+            return False
         try:
-            uri = "/mo/q/sync"
-            res: Dict = await self.get(uri)
-            utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
-            if res and res.get("no") == 0:
-                ping_flag = True
+            # Read cookies from the browser and check the key login cookies
+            _, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+
+            # Baidu Tieba login markers: STOKEN or PTOKEN
+            stoken = cookie_dict.get("STOKEN")
+            ptoken = cookie_dict.get("PTOKEN")
+            bduss = cookie_dict.get("BDUSS")  # Baidu's general login cookie
+
+            if stoken or ptoken or bduss:
+                utils.logger.info(f"[BaiduTieBaClient.pong] Login state verified by cookies (STOKEN: {bool(stoken)}, PTOKEN: {bool(ptoken)}, BDUSS: {bool(bduss)})")
+                return True
             else:
-                utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
-                ping_flag = False
+                utils.logger.info("[BaiduTieBaClient.pong] No valid login cookies found, need to login")
+                return False
         except Exception as e:
-            utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
-            ping_flag = False
-        return ping_flag
+            utils.logger.error(f"[BaiduTieBaClient.pong] Check login state failed: {e}, assume not logged in")
+            return False

     async def update_cookies(self, browser_context: BrowserContext):
         """
@@ -149,7 +206,9 @@ class BaiduTieBaClient(AbstractApiClient):
         Returns:
         """
-        pass
+        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+        self.headers["Cookie"] = cookie_str
+        utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")

     async def get_notes_by_keyword(
         self,
@@ -160,7 +219,7 @@ class BaiduTieBaClient(AbstractApiClient):
         note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
     ) -> List[TiebaNote]:
         """
-        Search tieba posts by keyword
+        Search tieba posts by keyword (fetched through Playwright pages to avoid API detection)
         Args:
             keyword: keyword
             page: page number
@@ -170,30 +229,81 @@ class BaiduTieBaClient(AbstractApiClient):
         Returns:
         """
-        uri = "/f/search/res"
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_notes_by_keyword] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based search")
+
+        # Build the search URL
+        # Example: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=编程
+        search_url = f"{self._host}/f/search/res"
         params = {
-            "isnew": 1,
+            "ie": "utf-8",
             "qw": keyword,
             "rn": page_size,
             "pn": page,
            "sm": sort.value,
             "only_thread": note_type.value,
         }
-        page_content = await self.get(uri, params=params, return_ori_content=True)
-        return self._page_extractor.extract_search_note_list(page_content)
+
+        # Assemble the full URL
+        full_url = f"{search_url}?{urlencode(params)}"
+        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Visiting search page: {full_url}")
+
+        try:
+            # Open the search page with Playwright
+            await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
+            # Wait for the page to load, using the delay configured in the config file
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # Grab the page HTML
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Fetched search page HTML, length: {len(page_content)}")
+
+            # Extract the search results
+            notes = self._page_extractor.extract_search_note_list(page_content)
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Extracted {len(notes)} posts")
+            return notes
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_keyword] Search failed: {e}")
+            raise

     async def get_note_by_id(self, note_id: str) -> TiebaNote:
         """
-        Get post detail by post ID
+        Get post detail by post ID (fetched through Playwright pages to avoid API detection)
         Args:
-            note_id:
+            note_id: post ID
         Returns:
+            TiebaNote: post detail object
         """
-        uri = f"/p/{note_id}"
-        page_content = await self.get(uri, return_ori_content=True)
-        return self._page_extractor.extract_note_detail(page_content)
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_note_by_id] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based note detail fetching")
+
+        # Build the post detail URL
+        note_url = f"{self._host}/p/{note_id}"
+        utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Visiting post detail page: {note_url}")
+
+        try:
+            # Open the post detail page with Playwright
+            await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
+            # Wait for the page to load, using the delay configured in the config file
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # Grab the page HTML
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Fetched post detail HTML, length: {len(page_content)}")
+
+            # Extract the post detail
+            note_detail = self._page_extractor.extract_note_detail(page_content)
+            return note_detail
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_note_by_id] Failed to get post detail: {e}")
+            raise

     async def get_note_all_comments(
         self,
@@ -203,35 +313,68 @@ class BaiduTieBaClient(AbstractApiClient):
         max_count: int = 10,
     ) -> List[TiebaComment]:
         """
-        Get all first-level comments under a post; keeps fetching until every comment is found
+        Get all first-level comments under a post (fetched through Playwright pages to avoid API detection)
         Args:
             note_detail: post detail object
             crawl_interval: delay between fetches, in seconds
-            callback: called after each fetch
+            callback: callback invoked after each fetch
             max_count: maximum number of comments to fetch per post
         Returns:
+            List[TiebaComment]: list of comments
         """
-        uri = f"/p/{note_detail.note_id}"
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_note_all_comments] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based comment fetching")
+
         result: List[TiebaComment] = []
         current_page = 1
         while note_detail.total_replay_page >= current_page and len(result) < max_count:
-            params = {
-                "pn": current_page,
-            }
-            page_content = await self.get(uri, params=params, return_ori_content=True)
-            comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
-            if not comments:
-                break
-            if len(result) + len(comments) > max_count:
-                comments = comments[:max_count - len(result)]
-            if callback:
-                await callback(note_detail.note_id, comments)
-            result.extend(comments)
-            # Fetch all sub-comments
-            await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
-            await asyncio.sleep(crawl_interval)
-            current_page += 1
+            # Build the comment page URL
+            comment_url = f"{self._host}/p/{note_detail.note_id}?pn={current_page}"
+            utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Visiting comment page: {comment_url}")
+
+            try:
+                # Open the comment page with Playwright
+                await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
+                # Wait for the page to load, using the delay configured in the config file
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+                # Grab the page HTML
+                page_content = await self.playwright_page.content()
+
+                # Extract the comments
+                comments = self._page_extractor.extract_tieba_note_parment_comments(
+                    page_content, note_id=note_detail.note_id
+                )
+
+                if not comments:
+                    utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Page {current_page} has no comments, stopping")
+                    break
+
+                # Cap the number of comments
+                if len(result) + len(comments) > max_count:
+                    comments = comments[:max_count - len(result)]
+
+                if callback:
+                    await callback(note_detail.note_id, comments)
+
+                result.extend(comments)
+
+                # Fetch all sub-comments
+                await self.get_comments_all_sub_comments(
+                    comments, crawl_interval=crawl_interval, callback=callback
+                )
+
+                await asyncio.sleep(crawl_interval)
+                current_page += 1
+            except Exception as e:
+                utils.logger.error(f"[BaiduTieBaClient.get_note_all_comments] Failed to fetch comments on page {current_page}: {e}")
+                break
+
+        utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Fetched {len(result)} first-level comments in total")
         return result

     async def get_comments_all_sub_comments(
         self,
@@ -241,93 +384,194 @@ class BaiduTieBaClient(AbstractApiClient):
         callback: Optional[Callable] = None,
     ) -> List[TiebaComment]:
         """
-        Get all sub-comments under the given comments
+        Get all sub-comments under the given comments (fetched through Playwright pages to avoid API detection)
         Args:
             comments: list of comments
             crawl_interval: delay between fetches, in seconds
-            callback: called after each fetch
+            callback: callback invoked after each fetch
         Returns:
+            List[TiebaComment]: list of sub-comments
         """
-        uri = "/p/comment"
         if not config.ENABLE_GET_SUB_COMMENTS:
             return []

-        # # Fetching all sub-comments on tieba requires a logged-in session
-        # if self.headers.get("Cookies") == "" or not self.pong():
-        #     raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_comments_all_sub_comments] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based sub-comment fetching")
+
         all_sub_comments: List[TiebaComment] = []
         for parment_comment in comments:
             if parment_comment.sub_comment_count == 0:
                 continue
+
             current_page = 1
             max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
+
             while max_sub_page_num >= current_page:
-                params = {
-                    "tid": parment_comment.note_id,      # post ID
-                    "pid": parment_comment.comment_id,   # parent comment ID
-                    "fid": parment_comment.tieba_id,     # tieba ID
-                    "pn": current_page                   # page number
-                }
-                page_content = await self.get(uri, params=params, return_ori_content=True)
-                sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, parent_comment=parment_comment)
-                if not sub_comments:
-                    break
-                if callback:
-                    await callback(parment_comment.note_id, sub_comments)
-                all_sub_comments.extend(sub_comments)
-                await asyncio.sleep(crawl_interval)
-                current_page += 1
+                # Build the sub-comment URL
+                sub_comment_url = (
+                    f"{self._host}/p/comment?"
+                    f"tid={parment_comment.note_id}&"
+                    f"pid={parment_comment.comment_id}&"
+                    f"fid={parment_comment.tieba_id}&"
+                    f"pn={current_page}"
+                )
+                utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] Visiting sub-comment page: {sub_comment_url}")
+
+                try:
+                    # Open the sub-comment page with Playwright
+                    await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
+                    # Wait for the page to load, using the delay configured in the config file
+                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+                    # Grab the page HTML
+                    page_content = await self.playwright_page.content()
+
+                    # Extract the sub-comments
+                    sub_comments = self._page_extractor.extract_tieba_note_sub_comments(
+                        page_content, parent_comment=parment_comment
+                    )
+
+                    if not sub_comments:
+                        utils.logger.info(
+                            f"[BaiduTieBaClient.get_comments_all_sub_comments] "
+                            f"Comment {parment_comment.comment_id} has no sub-comments on page {current_page}, stopping"
+                        )
+                        break
+
+                    if callback:
+                        await callback(parment_comment.note_id, sub_comments)
+
+                    all_sub_comments.extend(sub_comments)
+                    await asyncio.sleep(crawl_interval)
+                    current_page += 1
+                except Exception as e:
+                    utils.logger.error(
+                        f"[BaiduTieBaClient.get_comments_all_sub_comments] "
+                        f"Failed to fetch page {current_page} of sub-comments for comment {parment_comment.comment_id}: {e}"
+                    )
+                    break
+
+        utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] Fetched {len(all_sub_comments)} sub-comments in total")
         return all_sub_comments

     async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
         """
-        Get the post list by tieba name
+        Get the post list by tieba name (fetched through Playwright pages to avoid API detection)
         Args:
             tieba_name: tieba name
-            page_num: number of pages
+            page_num: page number
         Returns:
+            List[TiebaNote]: list of posts
         """
-        uri = f"/f?kw={tieba_name}&pn={page_num}"
-        page_content = await self.get(uri, return_ori_content=True)
-        return self._page_extractor.extract_tieba_note_list(page_content)
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_notes_by_tieba_name] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based tieba note fetching")
+
+        # Build the tieba post list URL
+        tieba_url = f"{self._host}/f?kw={quote(tieba_name)}&pn={page_num}"
+        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Visiting tieba page: {tieba_url}")
+
+        try:
+            # Open the tieba page with Playwright
+            await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
+            # Wait for the page to load, using the delay configured in the config file
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # Grab the page HTML
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Fetched tieba page HTML, length: {len(page_content)}")
+
+            # Extract the post list
+            notes = self._page_extractor.extract_tieba_note_list(page_content)
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Extracted {len(notes)} posts")
+            return notes
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] Failed to fetch the tieba post list: {e}")
+            raise

     async def get_creator_info_by_url(self, creator_url: str) -> str:
         """
-        Get creator info by creator ID
+        Get creator info by creator URL (fetched through Playwright pages to avoid API detection)
         Args:
             creator_url: creator homepage URL
         Returns:
+            str: page HTML content
         """
-        page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
-        return page_content
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_creator_info_by_url] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based creator info fetching")
+
+        utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Visiting creator homepage: {creator_url}")
+
+        try:
+            # Open the creator homepage with Playwright
+            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
+            # Wait for the page to load, using the delay configured in the config file
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # Grab the page HTML
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Fetched creator homepage HTML, length: {len(page_content)}")
+            return page_content
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] Failed to fetch the creator homepage: {e}")
+            raise

     async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
         """
-        Get all posts of a creator
+        Get a creator's posts (fetched through Playwright pages to avoid API detection)
         Args:
-            user_name:
-            page_number:
+            user_name: creator user name
+            page_number: page number
         Returns:
+            Dict: dictionary containing the post data
         """
-        uri = f"/home/get/getthread"
-        params = {
-            "un": user_name,
-            "pn": page_number,
-            "id": "utf-8",
-            "_": utils.get_current_timestamp(),
-        }
-        return await self.get(uri, params=params)
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_notes_by_creator] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based creator notes fetching")
+
+        # Build the creator post list URL
+        creator_url = f"{self._host}/home/get/getthread?un={quote(user_name)}&pn={page_number}&id=utf-8&_={utils.get_current_timestamp()}"
+        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] Visiting creator post list: {creator_url}")
+
+        try:
+            # Open the creator post list page with Playwright
+            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
+            # Wait for the page to load, using the delay configured in the config file
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # Grab the page content (this endpoint returns JSON)
+            page_content = await self.playwright_page.content()
+
+            # Extract the JSON payload (the page contains a <pre> tag or is raw JSON)
+            try:
+                # Try to pull the JSON out of the page
+                json_text = await self.playwright_page.evaluate("() => document.body.innerText")
+                result = json.loads(json_text)
+                utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] Fetched the creator's post data")
+                return result
+            except json.JSONDecodeError as e:
+                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] JSON parsing failed: {e}")
+                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] Page content: {page_content[:500]}")
+                raise Exception(f"Failed to parse JSON from creator notes page: {e}")
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] Failed to fetch the creator's post list: {e}")
+            raise

     async def get_all_notes_by_creator_user_name(
         self,
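The refactor above replaces httpx with blocking requests calls executed in a worker thread through asyncio.to_thread, so the async interface is preserved. A condensed, standalone sketch of that pattern (the URL and proxy handling are placeholders, not the client's real logic):

    import asyncio
    from typing import Optional
    import requests

    def sync_get(url: str, proxy: Optional[str] = None) -> requests.Response:
        proxies = {"http": proxy, "https": proxy} if proxy else None
        return requests.request("GET", url, proxies=proxies, timeout=10)

    async def fetch(url: str) -> str:
        # Runs the blocking call in the default thread pool so the event loop stays responsive.
        response = await asyncio.to_thread(sync_get, url)
        response.raise_for_status()
        return response.text

    print(asyncio.run(fetch("https://example.com"))[:60])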

@@ -11,7 +11,6 @@
 import asyncio
 import os
-# import random  # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -26,7 +25,7 @@ from playwright.async_api import (
 import config
 from base.base_crawler import AbstractCrawler
 from model.m_baidu_tieba import TiebaCreator, TiebaNote
-from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from proxy.proxy_ip_pool import IpInfoModel, ProxyIpPool, create_ip_pool
 from store import tieba as tieba_store
 from tools import utils
 from tools.cdp_browser import CDPBrowserManager
@@ -56,7 +55,7 @@ class TieBaCrawler(AbstractCrawler):
         Returns:
         """
-        ip_proxy_pool, httpx_proxy_format = None, None
+        playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:
             utils.logger.info(
                 "[BaiduTieBaCrawler.start] Begin create ip proxy pool ..."
@@ -65,31 +64,73 @@ class TieBaCrawler(AbstractCrawler):
                 config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
             )
             ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
-            _, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
+            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
             utils.logger.info(
                 f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
             )

-        # Create a client to interact with the baidutieba website.
-        self.tieba_client = BaiduTieBaClient(
-            ip_pool=ip_proxy_pool,
-            default_ip_proxy=httpx_proxy_format,
-        )
-        crawler_type_var.set(config.CRAWLER_TYPE)
-        if config.CRAWLER_TYPE == "search":
-            # Search for notes and retrieve their comment information.
-            await self.search()
-            await self.get_specified_tieba_notes()
-        elif config.CRAWLER_TYPE == "detail":
-            # Get the information and comments of the specified post
-            await self.get_specified_notes()
-        elif config.CRAWLER_TYPE == "creator":
-            # Get creator's information and their notes and comments
-            await self.get_creators_and_notes()
-        else:
-            pass
-
-        utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
+        async with async_playwright() as playwright:
+            # Choose the launch mode based on the configuration
+            if config.ENABLE_CDP_MODE:
+                utils.logger.info("[BaiduTieBaCrawler] Launching the browser in CDP mode")
+                self.browser_context = await self.launch_browser_with_cdp(
+                    playwright,
+                    playwright_proxy_format,
+                    self.user_agent,
+                    headless=config.CDP_HEADLESS,
+                )
+            else:
+                utils.logger.info("[BaiduTieBaCrawler] Launching the browser in standard mode")
+                # Launch a browser context.
+                chromium = playwright.chromium
+                self.browser_context = await self.launch_browser(
+                    chromium,
+                    playwright_proxy_format,
+                    self.user_agent,
+                    headless=config.HEADLESS,
+                )
+
+            # Inject anti-detection scripts targeting Baidu's specific checks
+            await self._inject_anti_detection_scripts()
+
+            self.context_page = await self.browser_context.new_page()
+
+            # Visit the Baidu homepage first and click the Tieba link, to avoid triggering security verification
+            await self._navigate_to_tieba_via_baidu()
+
+            # Create a client to interact with the baidutieba website.
+            self.tieba_client = await self.create_tieba_client(
+                httpx_proxy_format,
+                ip_proxy_pool if config.ENABLE_IP_PROXY else None
+            )
+
+            # Check login status and perform login if necessary
+            if not await self.tieba_client.pong(browser_context=self.browser_context):
+                login_obj = BaiduTieBaLogin(
+                    login_type=config.LOGIN_TYPE,
+                    login_phone="",  # your phone number
+                    browser_context=self.browser_context,
+                    context_page=self.context_page,
+                    cookie_str=config.COOKIES,
+                )
+                await login_obj.begin()
+                await self.tieba_client.update_cookies(browser_context=self.browser_context)
+
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
+                # Search for notes and retrieve their comment information.
+                await self.search()
+                await self.get_specified_tieba_notes()
+            elif config.CRAWLER_TYPE == "detail":
+                # Get the information and comments of the specified post
+                await self.get_specified_notes()
+            elif config.CRAWLER_TYPE == "creator":
+                # Get creator's information and their notes and comments
+                await self.get_creators_and_notes()
+            else:
+                pass
+
+            utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")

     async def search(self) -> None:
         """
@@ -347,6 +388,198 @@ class TieBaCrawler(AbstractCrawler):
                     f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
                 )

+    async def _navigate_to_tieba_via_baidu(self):
+        """
+        Mimic a real user's navigation path:
+        1. Visit the Baidu homepage (https://www.baidu.com/) first
+        2. Wait for the page to load
+        3. Click the "贴吧" (Tieba) link in the top navigation bar
+        4. Land on the Tieba homepage
+        Doing this helps avoid triggering Baidu's security verification
+        """
+        utils.logger.info("[TieBaCrawler] Mimicking a real user's navigation path...")
+
+        try:
+            # Step 1: visit the Baidu homepage
+            utils.logger.info("[TieBaCrawler] Step 1: visiting the Baidu homepage https://www.baidu.com/")
+            await self.context_page.goto("https://www.baidu.com/", wait_until="domcontentloaded")
+
+            # Step 2: wait for the page to load, using the delay configured in the config file
+            utils.logger.info(f"[TieBaCrawler] Step 2: waiting {config.CRAWLER_MAX_SLEEP_SEC}s to mimic user browsing...")
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # Step 3: find and click the Tieba link
+            utils.logger.info("[TieBaCrawler] Step 3: locating and clicking the Tieba link...")
+
+            # Try several selectors to make sure the Tieba link is found
+            tieba_selectors = [
+                'a[href="http://tieba.baidu.com/"]',
+                'a[href="https://tieba.baidu.com/"]',
+                'a.mnav:has-text("贴吧")',
+                'text=贴吧',
+            ]
+
+            tieba_link = None
+            for selector in tieba_selectors:
+                try:
+                    tieba_link = await self.context_page.wait_for_selector(selector, timeout=5000)
+                    if tieba_link:
+                        utils.logger.info(f"[TieBaCrawler] Found the Tieba link (selector: {selector})")
+                        break
+                except Exception:
+                    continue
+
+            if not tieba_link:
+                utils.logger.warning("[TieBaCrawler] Tieba link not found, navigating to the Tieba homepage directly")
+                await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+                return
+
+            # Step 4: click the Tieba link (check whether it opens a new tab)
+            utils.logger.info("[TieBaCrawler] Step 4: clicking the Tieba link...")
+
+            # Check the link's target attribute
+            target_attr = await tieba_link.get_attribute("target")
+            utils.logger.info(f"[TieBaCrawler] Link target attribute: {target_attr}")
+
+            if target_attr == "_blank":
+                # If it opens in a new tab, wait for the new page and switch to it
+                utils.logger.info("[TieBaCrawler] The link opens in a new tab, waiting for the new page...")
+                async with self.browser_context.expect_page() as new_page_info:
+                    await tieba_link.click()
+
+                # Grab the newly opened page
+                new_page = await new_page_info.value
+                await new_page.wait_for_load_state("domcontentloaded")
+
+                # Close the old Baidu homepage
+                await self.context_page.close()
+
+                # Switch to the new Tieba page
+                self.context_page = new_page
+                utils.logger.info("[TieBaCrawler] ✅ Switched to the new tab (Tieba page)")
+            else:
+                # Same-tab navigation: wait for it normally
+                utils.logger.info("[TieBaCrawler] The link navigates in the current tab...")
+                async with self.context_page.expect_navigation(wait_until="domcontentloaded"):
+                    await tieba_link.click()
+
+            # Step 5: wait for the page to settle, using the delay configured in the config file
+            utils.logger.info(f"[TieBaCrawler] Step 5: page loaded, waiting {config.CRAWLER_MAX_SLEEP_SEC}s...")
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            current_url = self.context_page.url
+            utils.logger.info(f"[TieBaCrawler] ✅ Reached Tieba via the Baidu homepage! Current URL: {current_url}")
+
+        except Exception as e:
+            utils.logger.error(f"[TieBaCrawler] Failed to reach Tieba via the Baidu homepage: {e}")
+            utils.logger.info("[TieBaCrawler] Falling back to visiting the Tieba homepage directly")
+            await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
+
+    async def _inject_anti_detection_scripts(self):
+        """
+        Inject anti-detection JavaScript
+        Targets Baidu Tieba's specific detection mechanisms
+        """
+        utils.logger.info("[TieBaCrawler] Injecting anti-detection scripts...")
+
+        # Lightweight anti-detection script; only overrides the key checkpoints
+        anti_detection_js = """
+        // Override navigator.webdriver
+        Object.defineProperty(navigator, 'webdriver', {
+            get: () => undefined,
+            configurable: true
+        });
+
+        // Override window.navigator.chrome
+        if (!window.navigator.chrome) {
+            window.navigator.chrome = {
+                runtime: {},
+                loadTimes: function() {},
+                csi: function() {},
+                app: {}
+            };
+        }
+
+        // Override the Permissions API
+        const originalQuery = window.navigator.permissions.query;
+        window.navigator.permissions.query = (parameters) => (
+            parameters.name === 'notifications' ?
+                Promise.resolve({ state: Notification.permission }) :
+                originalQuery(parameters)
+        );
+
+        // Override the plugins length (make it look like plugins exist)
+        Object.defineProperty(navigator, 'plugins', {
+            get: () => [1, 2, 3, 4, 5],
+            configurable: true
+        });
+
+        // Override languages
+        Object.defineProperty(navigator, 'languages', {
+            get: () => ['zh-CN', 'zh', 'en'],
+            configurable: true
+        });
+
+        // Remove window.cdc_ and other ChromeDriver leftovers
+        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
+        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
+        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
+
+        console.log('[Anti-Detection] Scripts injected successfully');
+        """
+
+        await self.browser_context.add_init_script(anti_detection_js)
+        utils.logger.info("[TieBaCrawler] Anti-detection scripts injected")
+
+    async def create_tieba_client(
+        self, httpx_proxy: Optional[str], ip_pool: Optional[ProxyIpPool] = None
+    ) -> BaiduTieBaClient:
+        """
+        Create tieba client with real browser User-Agent and complete headers
+        Args:
+            httpx_proxy: HTTP proxy
+            ip_pool: IP proxy pool
+        Returns:
+            BaiduTieBaClient instance
+        """
+        utils.logger.info("[TieBaCrawler.create_tieba_client] Begin create tieba API client...")
+
+        # Extract the User-Agent from the real browser to avoid detection
+        user_agent = await self.context_page.evaluate("() => navigator.userAgent")
+        utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}")
+
+        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
+
+        # Build a full set of browser request headers to mimic real browser behavior
+        tieba_client = BaiduTieBaClient(
+            timeout=10,
+            ip_pool=ip_pool,
+            default_ip_proxy=httpx_proxy,
+            headers={
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+                "Accept-Language": "zh-CN,zh;q=0.9",
+                "Accept-Encoding": "gzip, deflate, br",
+                "Connection": "keep-alive",
+                "User-Agent": user_agent,  # use the real browser's UA
+                "Cookie": cookie_str,
+                "Host": "tieba.baidu.com",
+                "Referer": "https://tieba.baidu.com/",
+                "Sec-Fetch-Dest": "document",
+                "Sec-Fetch-Mode": "navigate",
+                "Sec-Fetch-Site": "same-origin",
+                "Sec-Fetch-User": "?1",
+                "Upgrade-Insecure-Requests": "1",
+                "sec-ch-ua": '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
+                "sec-ch-ua-mobile": "?0",
+                "sec-ch-ua-platform": '"macOS"',
+            },
+            playwright_page=self.context_page,  # pass in the Playwright page object
+        )
+        return tieba_client
+
     async def launch_browser(
         self,
         chromium: BrowserType,
@@ -381,10 +614,11 @@ class TieBaCrawler(AbstractCrawler):
                 proxy=playwright_proxy,  # type: ignore
                 viewport={"width": 1920, "height": 1080},
                 user_agent=user_agent,
+                channel="chrome",  # use the system's stable Chrome
             )
             return browser_context
         else:
-            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome")  # type: ignore
             browser_context = await browser.new_context(
                 viewport={"width": 1920, "height": 1080}, user_agent=user_agent
             )

@@ -23,6 +23,7 @@ from urllib.parse import parse_qs, unquote, urlencode
 import httpx
 from httpx import Response
 from playwright.async_api import BrowserContext, Page
+from tenacity import retry, stop_after_attempt, wait_fixed

 import config
 from tools import utils
@@ -50,6 +51,7 @@ class WeiboClient:
         self.cookie_dict = cookie_dict
         self._image_agent_host = "https://i1.wp.com/"

+    @retry(stop=stop_after_attempt(5), wait=wait_fixed(3))
     async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
         enable_return_response = kwargs.pop("return_response", False)
         async with httpx.AsyncClient(proxy=self.proxy) as client:
@@ -58,7 +60,16 @@ class WeiboClient:
         if enable_return_response:
             return response

-        data: Dict = response.json()
+        try:
+            data: Dict = response.json()
+        except json.decoder.JSONDecodeError:
+            # issue #771: the search endpoint returns 432; retry several times and refresh the h5 cookies
+            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err code: {response.status_code} res:{response.text}")
+            await self.playwright_page.goto(self._host)
+            await asyncio.sleep(2)
+            await self.update_cookies(browser_context=self.playwright_page.context)
+            raise DataFetchError(f"get response code error: {response.status_code}")
+
         ok_code = data.get("ok")
         if ok_code == 0:  # response error
             utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
@@ -99,10 +110,24 @@ class WeiboClient:
             ping_flag = False
         return ping_flag

-    async def update_cookies(self, browser_context: BrowserContext):
-        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+    async def update_cookies(self, browser_context: BrowserContext, urls: Optional[List[str]] = None):
+        """
+        Update cookies from browser context
+        :param browser_context: Browser context
+        :param urls: Optional list of URLs to filter cookies (e.g., ["https://m.weibo.cn"])
+                     If provided, only cookies for these URLs will be retrieved
+        """
+        if urls:
+            cookies = await browser_context.cookies(urls=urls)
+            utils.logger.info(f"[WeiboClient.update_cookies] Updating cookies for specific URLs: {urls}")
+        else:
+            cookies = await browser_context.cookies()
+            utils.logger.info("[WeiboClient.update_cookies] Updating all cookies")
+
+        cookie_str, cookie_dict = utils.convert_cookies(cookies)
         self.headers["Cookie"] = cookie_str
         self.cookie_dict = cookie_dict
+        utils.logger.info(f"[WeiboClient.update_cookies] Cookie updated successfully, total: {len(cookie_dict)} cookies")

     async def get_note_by_keyword(
         self,
@@ -288,27 +313,14 @@ class WeiboClient:
         """
         uri = "/api/container/getIndex"
-        container_info = await self.get_creator_container_info(creator_id)
-        if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
-            utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
-            raise DataFetchError("get containerid failed")
+        containerid = f"100505{creator_id}"
         params = {
             "jumpfrom": "weibocom",
             "type": "uid",
             "value": creator_id,
-            "containerid": container_info["fid_container_id"],
+            "containerid": containerid,
         }

         user_res = await self.get(uri, params)

-        if user_res.get("tabsInfo"):
-            tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
-            for tab in tabs:
-                if tab.get("tabKey") == "weibo":
-                    container_info["lfid_container_id"] = tab.get("containerid")
-                    break
-
-        user_res.update(container_info)
         return user_res

     async def get_notes_by_creator(
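To keep mobile (m.weibo.cn) and PC cookies apart, the new update_cookies accepts a urls filter; the underlying Playwright call is cookies(urls=...). A small sketch of how such a filtered cookie header could be assembled (the helper name and URL are illustrative):

    from typing import List, Optional
    from playwright.async_api import BrowserContext

    async def mobile_cookie_header(browser_context: BrowserContext, urls: Optional[List[str]] = None) -> str:
        # cookies(urls=...) returns only the cookies that would be sent to those URLs.
        cookies = await browser_context.cookies(urls=urls) if urls else await browser_context.cookies()
        return "; ".join(f"{c['name']}={c['value']}" for c in cookies)

    # usage inside an async context:
    #     header = await mobile_cookie_header(browser_context, urls=["https://m.weibo.cn"])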
async def get_notes_by_creator( async def get_notes_by_creator(

View File

@@ -77,11 +77,16 @@ class WeiboCrawler(AbstractCrawler):
# Launch a browser context. # Launch a browser context.
chromium = playwright.chromium chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS) self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.mobile_index_url)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url)
await asyncio.sleep(2)
# Create a client to interact with the xiaohongshu website. # Create a client to interact with the xiaohongshu website.
self.wb_client = await self.create_weibo_client(httpx_proxy_format) self.wb_client = await self.create_weibo_client(httpx_proxy_format)
if not await self.wb_client.pong(): if not await self.wb_client.pong():
@@ -97,8 +102,12 @@ class WeiboCrawler(AbstractCrawler):
                 # 登录成功后重定向到手机端的网站再更新手机端登录成功的cookie
                 utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
                 await self.context_page.goto(self.mobile_index_url)
-                await asyncio.sleep(2)
-                await self.wb_client.update_cookies(browser_context=self.browser_context)
+                await asyncio.sleep(3)
+                # 只获取移动端的 cookies避免 PC 端和移动端 cookies 混淆
+                await self.wb_client.update_cookies(
+                    browser_context=self.browser_context,
+                    urls=[self.mobile_index_url]
+                )

             crawler_type_var.set(config.CRAWLER_TYPE)
             if config.CRAWLER_TYPE == "search":
@@ -290,7 +299,7 @@ class WeiboCrawler(AbstractCrawler):
         # Get all note information of the creator
         all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
             creator_id=user_id,
-            container_id=createor_info_res.get("lfid_container_id"),
+            container_id=f"107603{user_id}",
             crawl_interval=0,
             callback=weibo_store.batch_update_weibo_notes,
         )
@@ -304,7 +313,7 @@ class WeiboCrawler(AbstractCrawler):
     async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
         """Create xhs client"""
         utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
-        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
+        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies(urls=[self.mobile_index_url]))
         weibo_client_obj = WeiboClient(
             proxy=httpx_proxy,
             headers={
@@ -340,10 +349,11 @@ class WeiboCrawler(AbstractCrawler):
"height": 1080 "height": 1080
}, },
user_agent=user_agent, user_agent=user_agent,
channel="chrome", # 使用系统的Chrome稳定版
) )
return browser_context return browser_context
else: else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome") # type: ignore
browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent) browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
return browser_context return browser_context

View File

@@ -10,18 +10,20 @@
 import asyncio
 import json
-import re
+import time
 from typing import Any, Callable, Dict, List, Optional, Union
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse, parse_qs

 import httpx
 from playwright.async_api import BrowserContext, Page
-from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
+from tenacity import retry, stop_after_attempt, wait_fixed
+from xhshow import Xhshow

 import config
 from base.base_crawler import AbstractApiClient
 from tools import utils
+from html import unescape

 from .exception import DataFetchError, IPBlockError
 from .field import SearchNoteType, SearchSortType
@@ -52,26 +54,52 @@ class XiaoHongShuClient(AbstractApiClient):
         self.playwright_page = playwright_page
         self.cookie_dict = cookie_dict
         self._extractor = XiaoHongShuExtractor()
+        # 初始化 xhshow 客户端用于签名生成
+        self._xhshow_client = Xhshow()

-    async def _pre_headers(self, url: str, data=None) -> Dict:
-        """
-        请求头参数签名
-        Args:
-            url:
-            data:
-        Returns:
-        """
-        encrypt_params = await self.playwright_page.evaluate(
-            "([url, data]) => window._webmsxyw(url,data)", [url, data]
-        )
-        local_storage = await self.playwright_page.evaluate("() => window.localStorage")
+    async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict:
+        """请求头参数签名
+        Args:
+            url: 请求的URL(GET请求是包含请求的参数)
+            params: GET请求的参数
+            payload: POST请求的参数
+        Returns:
+            Dict: 请求头参数签名
+        """
+        a1_value = self.cookie_dict.get("a1", "")
+        parsed = urlparse(url)
+        uri = parsed.path
+        if params is not None:
+            x_s = self._xhshow_client.sign_xs_get(
+                uri=uri, a1_value=a1_value, params=params
+            )
+        elif payload is not None:
+            x_s = self._xhshow_client.sign_xs_post(
+                uri=uri, a1_value=a1_value, payload=payload
+            )
+        else:
+            raise ValueError("params or payload is required")
+        # 获取 b1 值
+        b1_value = ""
+        try:
+            if self.playwright_page:
+                local_storage = await self.playwright_page.evaluate(
+                    "() => window.localStorage"
+                )
+                b1_value = local_storage.get("b1", "")
+        except Exception as e:
+            utils.logger.warning(
+                f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}"
+            )
         signs = sign(
-            a1=self.cookie_dict.get("a1", ""),
-            b1=local_storage.get("b1", ""),
-            x_s=encrypt_params.get("X-s", ""),
-            x_t=str(encrypt_params.get("X-t", "")),
+            a1=a1_value,
+            b1=b1_value,
+            x_s=x_s,
+            x_t=str(int(time.time() * 1000)),
         )

         headers = {
@@ -116,9 +144,10 @@ class XiaoHongShuClient(AbstractApiClient):
elif data["code"] == self.IP_ERROR_CODE: elif data["code"] == self.IP_ERROR_CODE:
raise IPBlockError(self.IP_ERROR_STR) raise IPBlockError(self.IP_ERROR_STR)
else: else:
raise DataFetchError(data.get("msg", None)) err_msg = data.get("msg", None) or f"{response.text}"
raise DataFetchError(err_msg)
async def get(self, uri: str, params=None) -> Dict: async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
""" """
GET请求对请求头签名 GET请求对请求头签名
Args: Args:
@@ -128,12 +157,18 @@ class XiaoHongShuClient(AbstractApiClient):
         Returns:
         """
-        final_uri = uri
+        headers = await self._pre_headers(uri, params)
         if isinstance(params, dict):
-            final_uri = f"{uri}?" f"{urlencode(params)}"
-        headers = await self._pre_headers(final_uri)
+            # 使用 xhsshow build_url 构建完整的 URL
+            full_url = self._xhshow_client.build_url(
+                base_url=f"{self._host}{uri}",
+                params=params
+            )
+        else:
+            full_url = f"{self._host}{uri}"
         return await self.request(
-            method="GET", url=f"{self._host}{final_uri}", headers=headers
+            method="GET", url=full_url, headers=headers
         )

     async def post(self, uri: str, data: dict, **kwargs) -> Dict:
@@ -146,8 +181,8 @@ class XiaoHongShuClient(AbstractApiClient):
         Returns:
         """
-        headers = await self._pre_headers(uri, data)
-        json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
+        headers = await self._pre_headers(uri, payload=data)
+        json_str = self._xhshow_client.build_json_body(payload=data)
         return await self.request(
             method="POST",
             url=f"{self._host}{uri}",
@@ -481,6 +516,8 @@ class XiaoHongShuClient(AbstractApiClient):
         creator: str,
         cursor: str,
         page_size: int = 30,
+        xsec_token: str = "",
+        xsec_source: str = "pc_feed",
     ) -> Dict:
         """
         获取博主的笔记
@@ -488,24 +525,29 @@ class XiaoHongShuClient(AbstractApiClient):
             creator: 博主ID
             cursor: 上一页最后一条笔记的ID
             page_size: 分页数据长度
+            xsec_token: 验证token
+            xsec_source: 渠道来源
         Returns:
         """
-        uri = "/api/sns/web/v1/user_posted"
-        data = {
-            "user_id": creator,
-            "cursor": cursor,
-            "num": page_size,
-            "image_formats": "jpg,webp,avif",
+        uri = f"/api/sns/web/v1/user_posted"
+        params = {
+            "num": page_size,
+            "cursor": cursor,
+            "user_id": creator,
+            "xsec_token": xsec_token,
+            "xsec_source": xsec_source,
         }
-        return await self.get(uri, data)
+        return await self.get(uri, params)
     async def get_all_notes_by_creator(
         self,
         user_id: str,
         crawl_interval: float = 1.0,
         callback: Optional[Callable] = None,
+        xsec_token: str = "",
+        xsec_source: str = "pc_feed",
     ) -> List[Dict]:
         """
         获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
@@ -513,6 +555,8 @@ class XiaoHongShuClient(AbstractApiClient):
             user_id: 用户ID
             crawl_interval: 爬取一次的延迟单位(秒)
             callback: 一次分页爬取结束后的更新回调函数
+            xsec_token: 验证token
+            xsec_source: 渠道来源
         Returns:
@@ -521,7 +565,9 @@ class XiaoHongShuClient(AbstractApiClient):
         notes_has_more = True
         notes_cursor = ""
         while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
-            notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
+            notes_res = await self.get_notes_by_creator(
+                user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source
+            )
             if not notes_res:
                 utils.logger.error(
                     f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."

View File

@@ -79,8 +79,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
             self.user_agent,
             headless=config.HEADLESS,
         )
         # stealth.min.js is a js script to prevent the website from detecting the crawler.
         await self.browser_context.add_init_script(path="libs/stealth.min.js")
         self.context_page = await self.browser_context.new_page()
         await self.context_page.goto(self.index_url)
@@ -200,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
                 user_id=user_id,
                 crawl_interval=crawl_interval,
                 callback=self.fetch_creator_notes_detail,
+                xsec_token=creator_info.xsec_token,
+                xsec_source=creator_info.xsec_source,
             )

             note_ids = []
@@ -278,17 +281,17 @@ class XiaoHongShuCrawler(AbstractCrawler):
             Dict: note detail
         """
         note_detail = None
+        utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
         async with semaphore:
             try:
-                utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
                 try:
                     note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
                 except RetryError:
                     pass

                 if not note_detail:
-                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
+                                                                                 enable_cookie=True)
                     if not note_detail:
                         raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")

View File

@@ -27,16 +27,17 @@ def sign(a1="", b1="", x_s="", x_t=""):
"s0": 3, # getPlatformCode "s0": 3, # getPlatformCode
"s1": "", "s1": "",
"x0": "1", # localStorage.getItem("b1b1") "x0": "1", # localStorage.getItem("b1b1")
"x1": "3.7.8-2", # version "x1": "4.2.2", # version
"x2": "Mac OS", "x2": "Mac OS",
"x3": "xhs-pc-web", "x3": "xhs-pc-web",
"x4": "4.27.2", "x4": "4.74.0",
"x5": a1, # cookie of a1 "x5": a1, # cookie of a1
"x6": x_t, "x6": x_t,
"x7": x_s, "x7": x_s,
"x8": b1, # localStorage.getItem("b1") "x8": b1, # localStorage.getItem("b1")
"x9": mrc(x_t + x_s + b1), "x9": mrc(x_t + x_s + b1),
"x10": 154, # getSigCount "x10": 154, # getSigCount
"x11": "normal"
} }
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':'))) encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
x_s_common = b64Encode(encode_str) x_s_common = b64Encode(encode_str)
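The surrounding helpers (`encodeUtf8`, `b64Encode`, `mrc`) are project utilities; assuming they behave like plain UTF-8 encoding, Base64 encoding and a checksum, the x-s-common header reduces to roughly this standard-library sketch (placeholder values instead of live a1/x_t/x_s/b1):

```python
import base64
import json

common = {
    "s0": 3, "s1": "", "x0": "1", "x1": "4.2.2", "x2": "Mac OS",
    "x3": "xhs-pc-web", "x4": "4.74.0", "x5": "<a1>", "x6": "<x_t>",
    "x7": "<x_s>", "x8": "<b1>", "x9": 0,  # x9 is mrc(x_t + x_s + b1) in the real code
    "x10": 154, "x11": "normal",
}
encode_str = json.dumps(common, separators=(",", ":")).encode("utf-8")
x_s_common = base64.b64encode(encode_str).decode()
print(x_s_common)
```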

View File

@@ -86,8 +86,8 @@ class ZhihuCrawler(AbstractCrawler):
             self.browser_context = await self.launch_browser(
                 chromium, None, self.user_agent, headless=config.HEADLESS
             )
             # stealth.min.js is a js script to prevent the website from detecting the crawler.
             await self.browser_context.add_init_script(path="libs/stealth.min.js")
             self.context_page = await self.browser_context.new_page()
             await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
@@ -429,10 +429,11 @@ class ZhihuCrawler(AbstractCrawler):
                 proxy=playwright_proxy,  # type: ignore
                 viewport={"width": 1920, "height": 1080},
                 user_agent=user_agent,
+                channel="chrome",  # 使用系统的Chrome稳定版
             )
             return browser_context
         else:
-            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy, channel="chrome")  # type: ignore
             browser_context = await browser.new_context(
                 viewport={"width": 1920, "height": 1080}, user_agent=user_agent
             )

View File

@@ -16,6 +16,7 @@ dependencies = [
"httpx==0.28.1", "httpx==0.28.1",
"jieba==0.42.1", "jieba==0.42.1",
"matplotlib==3.9.0", "matplotlib==3.9.0",
"motor>=3.3.0",
"opencv-python>=4.11.0.86", "opencv-python>=4.11.0.86",
"pandas==2.2.3", "pandas==2.2.3",
"parsel==1.9.1", "parsel==1.9.1",
@@ -32,6 +33,7 @@ dependencies = [
"typer>=0.12.3", "typer>=0.12.3",
"uvicorn==0.29.0", "uvicorn==0.29.0",
"wordcloud==1.9.3", "wordcloud==1.9.3",
"xhshow>=0.1.3",
] ]
[[tool.uv.index]] [[tool.uv.index]]

View File

@@ -24,3 +24,5 @@ cryptography>=45.0.7
 alembic>=1.16.5
 asyncmy>=0.2.10
 sqlalchemy>=2.0.43
+motor>=3.3.0
+xhshow>=0.1.3

View File

@@ -28,13 +28,14 @@ class BiliStoreFactory:
"db": BiliDbStoreImplement, "db": BiliDbStoreImplement,
"json": BiliJsonStoreImplement, "json": BiliJsonStoreImplement,
"sqlite": BiliSqliteStoreImplement, "sqlite": BiliSqliteStoreImplement,
"mongodb": BiliMongoStoreImplement,
} }
@staticmethod @staticmethod
def create_store() -> AbstractStore: def create_store() -> AbstractStore:
store_class = BiliStoreFactory.STORES.get(config.SAVE_DATA_OPTION) store_class = BiliStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
if not store_class: if not store_class:
raise ValueError("[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...") raise ValueError("[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
return store_class() return store_class()

View File

@@ -31,13 +31,14 @@ from database.models import BilibiliVideoComment, BilibiliVideo, BilibiliUpInfo,
 from tools.async_file_writer import AsyncFileWriter
 from tools import utils, words
 from var import crawler_type_var
+from database.mongodb_store_base import MongoDBStoreBase

 class BiliCsvStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
         )

     async def store_content(self, content_item: Dict):
@@ -220,7 +221,7 @@ class BiliJsonStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
         )

     async def store_content(self, content_item: Dict):
@@ -297,3 +298,61 @@ class BiliJsonStoreImplement(AbstractStore):
 class BiliSqliteStoreImplement(BiliDbStoreImplement):
     pass
class BiliMongoStoreImplement(AbstractStore):
"""B站MongoDB存储实现"""
def __init__(self):
self.mongo_store = MongoDBStoreBase(collection_prefix="bilibili")
async def store_content(self, content_item: Dict):
"""
存储视频内容到MongoDB
Args:
content_item: 视频内容数据
"""
video_id = content_item.get("video_id")
if not video_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"video_id": video_id},
data=content_item
)
utils.logger.info(f"[BiliMongoStoreImplement.store_content] Saved video {video_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
存储评论到MongoDB
Args:
comment_item: 评论数据
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[BiliMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
存储UP主信息到MongoDB
Args:
creator_item: UP主数据
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[BiliMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")

View File

@@ -22,7 +22,7 @@ from tools import utils
 class BilibiliVideo(AbstractStoreVideo):
-    video_store_path: str = "data/bilibili/videos"
+    video_store_path: str = "data/bili/videos"

     async def store_video(self, video_content_item: Dict):
         """

View File

@@ -27,13 +27,14 @@ class DouyinStoreFactory:
"db": DouyinDbStoreImplement, "db": DouyinDbStoreImplement,
"json": DouyinJsonStoreImplement, "json": DouyinJsonStoreImplement,
"sqlite": DouyinSqliteStoreImplement, "sqlite": DouyinSqliteStoreImplement,
"mongodb": DouyinMongoStoreImplement,
} }
@staticmethod @staticmethod
def create_store() -> AbstractStore: def create_store() -> AbstractStore:
store_class = DouyinStoreFactory.STORES.get(config.SAVE_DATA_OPTION) store_class = DouyinStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
if not store_class: if not store_class:
raise ValueError("[DouyinStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...") raise ValueError("[DouyinStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
return store_class() return store_class()

View File

@@ -28,6 +28,7 @@ from database.models import DouyinAweme, DouyinAwemeComment, DyCreator
 from tools import utils, words
 from tools.async_file_writer import AsyncFileWriter
 from var import crawler_type_var
+from database.mongodb_store_base import MongoDBStoreBase

 class DouyinCsvStoreImplement(AbstractStore):
@@ -195,4 +196,62 @@ class DouyinJsonStoreImplement(AbstractStore):
 class DouyinSqliteStoreImplement(DouyinDbStoreImplement):
     pass
class DouyinMongoStoreImplement(AbstractStore):
"""抖音MongoDB存储实现"""
def __init__(self):
self.mongo_store = MongoDBStoreBase(collection_prefix="douyin")
async def store_content(self, content_item: Dict):
"""
存储视频内容到MongoDB
Args:
content_item: 视频内容数据
"""
aweme_id = content_item.get("aweme_id")
if not aweme_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"aweme_id": aweme_id},
data=content_item
)
utils.logger.info(f"[DouyinMongoStoreImplement.store_content] Saved aweme {aweme_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
存储评论到MongoDB
Args:
comment_item: 评论数据
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[DouyinMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
存储创作者信息到MongoDB
Args:
creator_item: 创作者数据
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[DouyinMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")

View File

@@ -26,7 +26,8 @@ class KuaishouStoreFactory:
"csv": KuaishouCsvStoreImplement, "csv": KuaishouCsvStoreImplement,
"db": KuaishouDbStoreImplement, "db": KuaishouDbStoreImplement,
"json": KuaishouJsonStoreImplement, "json": KuaishouJsonStoreImplement,
"sqlite": KuaishouSqliteStoreImplement "sqlite": KuaishouSqliteStoreImplement,
"mongodb": KuaishouMongoStoreImplement,
} }
@staticmethod @staticmethod
@@ -34,7 +35,7 @@ class KuaishouStoreFactory:
         store_class = KuaishouStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
         if not store_class:
             raise ValueError(
-                "[KuaishouStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
+                "[KuaishouStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
         return store_class()

View File

@@ -30,6 +30,7 @@ from database.db_session import get_session
 from database.models import KuaishouVideo, KuaishouVideoComment
 from tools import utils, words
 from var import crawler_type_var
+from database.mongodb_store_base import MongoDBStoreBase

 def calculate_number_of_files(file_store_path: str) -> int:
@@ -157,4 +158,62 @@ class KuaishouJsonStoreImplement(AbstractStore):
 class KuaishouSqliteStoreImplement(KuaishouDbStoreImplement):
     async def store_creator(self, creator: Dict):
         pass
class KuaishouMongoStoreImplement(AbstractStore):
"""快手MongoDB存储实现"""
def __init__(self):
self.mongo_store = MongoDBStoreBase(collection_prefix="kuaishou")
async def store_content(self, content_item: Dict):
"""
存储视频内容到MongoDB
Args:
content_item: 视频内容数据
"""
video_id = content_item.get("video_id")
if not video_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"video_id": video_id},
data=content_item
)
utils.logger.info(f"[KuaishouMongoStoreImplement.store_content] Saved video {video_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
存储评论到MongoDB
Args:
comment_item: 评论数据
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[KuaishouMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
存储创作者信息到MongoDB
Args:
creator_item: 创作者数据
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[KuaishouMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")

View File

@@ -23,7 +23,8 @@ class TieBaStoreFactory:
"csv": TieBaCsvStoreImplement, "csv": TieBaCsvStoreImplement,
"db": TieBaDbStoreImplement, "db": TieBaDbStoreImplement,
"json": TieBaJsonStoreImplement, "json": TieBaJsonStoreImplement,
"sqlite": TieBaSqliteStoreImplement "sqlite": TieBaSqliteStoreImplement,
"mongodb": TieBaMongoStoreImplement,
} }
@staticmethod @staticmethod
@@ -31,7 +32,7 @@ class TieBaStoreFactory:
         store_class = TieBaStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
         if not store_class:
             raise ValueError(
-                "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
+                "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
         return store_class()

View File

@@ -31,6 +31,7 @@ from tools import utils, words
 from database.db_session import get_session
 from var import crawler_type_var
 from tools.async_file_writer import AsyncFileWriter
+from database.mongodb_store_base import MongoDBStoreBase

 def calculate_number_of_files(file_store_path: str) -> int:
@@ -190,3 +191,61 @@ class TieBaSqliteStoreImplement(TieBaDbStoreImplement):
     Tieba sqlite store implement
     """
     pass
class TieBaMongoStoreImplement(AbstractStore):
"""贴吧MongoDB存储实现"""
def __init__(self):
self.mongo_store = MongoDBStoreBase(collection_prefix="tieba")
async def store_content(self, content_item: Dict):
"""
存储帖子内容到MongoDB
Args:
content_item: 帖子内容数据
"""
note_id = content_item.get("note_id")
if not note_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"note_id": note_id},
data=content_item
)
utils.logger.info(f"[TieBaMongoStoreImplement.store_content] Saved note {note_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
存储评论到MongoDB
Args:
comment_item: 评论数据
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[TieBaMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
存储创作者信息到MongoDB
Args:
creator_item: 创作者数据
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[TieBaMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")

View File

@@ -28,13 +28,14 @@ class WeibostoreFactory:
"db": WeiboDbStoreImplement, "db": WeiboDbStoreImplement,
"json": WeiboJsonStoreImplement, "json": WeiboJsonStoreImplement,
"sqlite": WeiboSqliteStoreImplement, "sqlite": WeiboSqliteStoreImplement,
"mongodb": WeiboMongoStoreImplement,
} }
@staticmethod @staticmethod
def create_store() -> AbstractStore: def create_store() -> AbstractStore:
store_class = WeibostoreFactory.STORES.get(config.SAVE_DATA_OPTION) store_class = WeibostoreFactory.STORES.get(config.SAVE_DATA_OPTION)
if not store_class: if not store_class:
raise ValueError("[WeibotoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...") raise ValueError("[WeibotoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
return store_class() return store_class()

View File

@@ -31,6 +31,7 @@ from tools import utils, words
 from tools.async_file_writer import AsyncFileWriter
 from database.db_session import get_session
 from var import crawler_type_var
+from database.mongodb_store_base import MongoDBStoreBase

 def calculate_number_of_files(file_store_path: str) -> int:
@@ -212,3 +213,61 @@ class WeiboSqliteStoreImplement(WeiboDbStoreImplement):
     Weibo content SQLite storage implementation
     """
     pass
class WeiboMongoStoreImplement(AbstractStore):
"""微博MongoDB存储实现"""
def __init__(self):
self.mongo_store = MongoDBStoreBase(collection_prefix="weibo")
async def store_content(self, content_item: Dict):
"""
存储微博内容到MongoDB
Args:
content_item: 微博内容数据
"""
note_id = content_item.get("note_id")
if not note_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"note_id": note_id},
data=content_item
)
utils.logger.info(f"[WeiboMongoStoreImplement.store_content] Saved note {note_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
存储评论到MongoDB
Args:
comment_item: 评论数据
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[WeiboMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
存储创作者信息到MongoDB
Args:
creator_item: 创作者数据
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[WeiboMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")

View File

@@ -27,13 +27,14 @@ class XhsStoreFactory:
"db": XhsDbStoreImplement, "db": XhsDbStoreImplement,
"json": XhsJsonStoreImplement, "json": XhsJsonStoreImplement,
"sqlite": XhsSqliteStoreImplement, "sqlite": XhsSqliteStoreImplement,
"mongodb": XhsMongoStoreImplement,
} }
@staticmethod @staticmethod
def create_store() -> AbstractStore: def create_store() -> AbstractStore:
store_class = XhsStoreFactory.STORES.get(config.SAVE_DATA_OPTION) store_class = XhsStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
if not store_class: if not store_class:
raise ValueError("[XhsStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...") raise ValueError("[XhsStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
return store_class() return store_class()

View File

@@ -18,6 +18,8 @@ from database.models import XhsNote, XhsNoteComment, XhsCreator
 from tools.async_file_writer import AsyncFileWriter
 from tools.time_util import get_current_timestamp
 from var import crawler_type_var
+from database.mongodb_store_base import MongoDBStoreBase
+from tools import utils

 class XhsCsvStoreImplement(AbstractStore):
     def __init__(self, **kwargs):
@@ -258,3 +260,62 @@ class XhsDbStoreImplement(AbstractStore):
 class XhsSqliteStoreImplement(XhsDbStoreImplement):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
class XhsMongoStoreImplement(AbstractStore):
"""小红书MongoDB存储实现"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.mongo_store = MongoDBStoreBase(collection_prefix="xhs")
async def store_content(self, content_item: Dict):
"""
存储笔记内容到MongoDB
Args:
content_item: 笔记内容数据
"""
note_id = content_item.get("note_id")
if not note_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"note_id": note_id},
data=content_item
)
utils.logger.info(f"[XhsMongoStoreImplement.store_content] Saved note {note_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
存储评论到MongoDB
Args:
comment_item: 评论数据
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[XhsMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
存储创作者信息到MongoDB
Args:
creator_item: 创作者数据
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[XhsMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")

View File

@@ -18,7 +18,8 @@ from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator
 from ._store_impl import (ZhihuCsvStoreImplement,
                           ZhihuDbStoreImplement,
                           ZhihuJsonStoreImplement,
-                          ZhihuSqliteStoreImplement)
+                          ZhihuSqliteStoreImplement,
+                          ZhihuMongoStoreImplement)
 from tools import utils
 from var import source_keyword_var
@@ -28,14 +29,15 @@ class ZhihuStoreFactory:
"csv": ZhihuCsvStoreImplement, "csv": ZhihuCsvStoreImplement,
"db": ZhihuDbStoreImplement, "db": ZhihuDbStoreImplement,
"json": ZhihuJsonStoreImplement, "json": ZhihuJsonStoreImplement,
"sqlite": ZhihuSqliteStoreImplement "sqlite": ZhihuSqliteStoreImplement,
"mongodb": ZhihuMongoStoreImplement,
} }
@staticmethod @staticmethod
def create_store() -> AbstractStore: def create_store() -> AbstractStore:
store_class = ZhihuStoreFactory.STORES.get(config.SAVE_DATA_OPTION) store_class = ZhihuStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
if not store_class: if not store_class:
raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...") raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite or mongodb ...")
return store_class() return store_class()
async def batch_update_zhihu_contents(contents: List[ZhihuContent]): async def batch_update_zhihu_contents(contents: List[ZhihuContent]):

View File

@@ -31,6 +31,7 @@ from database.models import ZhihuContent, ZhihuComment, ZhihuCreator
 from tools import utils, words
 from var import crawler_type_var
 from tools.async_file_writer import AsyncFileWriter
+from database.mongodb_store_base import MongoDBStoreBase

 def calculate_number_of_files(file_store_path: str) -> int:
     """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
@@ -189,3 +190,61 @@ class ZhihuSqliteStoreImplement(ZhihuDbStoreImplement):
     Zhihu content SQLite storage implementation
     """
     pass
class ZhihuMongoStoreImplement(AbstractStore):
"""知乎MongoDB存储实现"""
def __init__(self):
self.mongo_store = MongoDBStoreBase(collection_prefix="zhihu")
async def store_content(self, content_item: Dict):
"""
存储内容到MongoDB
Args:
content_item: 内容数据
"""
note_id = content_item.get("note_id")
if not note_id:
return
await self.mongo_store.save_or_update(
collection_suffix="contents",
query={"note_id": note_id},
data=content_item
)
utils.logger.info(f"[ZhihuMongoStoreImplement.store_content] Saved note {note_id} to MongoDB")
async def store_comment(self, comment_item: Dict):
"""
存储评论到MongoDB
Args:
comment_item: 评论数据
"""
comment_id = comment_item.get("comment_id")
if not comment_id:
return
await self.mongo_store.save_or_update(
collection_suffix="comments",
query={"comment_id": comment_id},
data=comment_item
)
utils.logger.info(f"[ZhihuMongoStoreImplement.store_comment] Saved comment {comment_id} to MongoDB")
async def store_creator(self, creator_item: Dict):
"""
存储创作者信息到MongoDB
Args:
creator_item: 创作者数据
"""
user_id = creator_item.get("user_id")
if not user_id:
return
await self.mongo_store.save_or_update(
collection_suffix="creators",
query={"user_id": user_id},
data=creator_item
)
utils.logger.info(f"[ZhihuMongoStoreImplement.store_creator] Saved creator {user_id} to MongoDB")

View File

@@ -0,0 +1,368 @@
# -*- coding: utf-8 -*-
import asyncio
import unittest
import sys
import os
from datetime import datetime
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from database.mongodb_store_base import MongoDBConnection, MongoDBStoreBase
from store.xhs._store_impl import XhsMongoStoreImplement
from store.douyin._store_impl import DouyinMongoStoreImplement
from config import db_config
class TestMongoDBRealConnection(unittest.TestCase):
@classmethod
def setUpClass(cls):
try:
conn = MongoDBConnection()
asyncio.run(conn._connect())
cls.mongodb_available = True
print("\n✓ MongoDB连接成功")
except Exception as e:
cls.mongodb_available = False
print(f"\n✗ MongoDB连接失败: {e}")
def setUp(self):
if not self.mongodb_available:
self.skipTest("MongoDB不可用")
MongoDBConnection._instance = None
MongoDBConnection._client = None
MongoDBConnection._db = None
def tearDown(self):
if self.mongodb_available:
conn = MongoDBConnection()
asyncio.run(conn.close())
@classmethod
def tearDownClass(cls):
if cls.mongodb_available:
async def cleanup():
conn = MongoDBConnection()
db = await conn.get_db()
test_collections = [
"test_xhs_contents",
"test_xhs_comments",
"test_xhs_creators",
"test_douyin_contents",
"test_douyin_comments",
"test_douyin_creators"
]
for collection_name in test_collections:
try:
await db[collection_name].drop()
except:
pass
await conn.close()
try:
asyncio.run(cleanup())
print("\n✓ 测试数据清理完成")
except Exception as e:
print(f"\n✗ 清理测试数据时出错: {e}")
def test_real_connection(self):
async def test():
conn = MongoDBConnection()
client = await conn.get_client()
db = await conn.get_db()
self.assertIsNotNone(client)
self.assertIsNotNone(db)
result = await db.command("ping")
self.assertEqual(result.get("ok"), 1.0)
asyncio.run(test())
def test_real_save_and_query(self):
async def test():
store = MongoDBStoreBase(collection_prefix="test_xhs")
test_data = {
"note_id": "test_note_001",
"title": "测试笔记",
"content": "这是一条测试内容",
"created_at": datetime.now().isoformat()
}
result = await store.save_or_update(
"contents",
{"note_id": "test_note_001"},
test_data
)
self.assertTrue(result)
found = await store.find_one(
"contents",
{"note_id": "test_note_001"}
)
self.assertIsNotNone(found)
self.assertEqual(found["note_id"], "test_note_001")
self.assertEqual(found["title"], "测试笔记")
asyncio.run(test())
def test_real_update(self):
async def test():
store = MongoDBStoreBase(collection_prefix="test_xhs")
initial_data = {
"note_id": "test_note_002",
"title": "初始标题",
"likes": 10
}
await store.save_or_update(
"contents",
{"note_id": "test_note_002"},
initial_data
)
updated_data = {
"note_id": "test_note_002",
"title": "更新后的标题",
"likes": 100
}
await store.save_or_update(
"contents",
{"note_id": "test_note_002"},
updated_data
)
found = await store.find_one(
"contents",
{"note_id": "test_note_002"}
)
self.assertEqual(found["title"], "更新后的标题")
self.assertEqual(found["likes"], 100)
asyncio.run(test())
def test_real_find_many(self):
async def test():
store = MongoDBStoreBase(collection_prefix="test_xhs")
test_user_id = "test_user_123"
for i in range(5):
data = {
"note_id": f"test_note_{i:03d}",
"user_id": test_user_id,
"title": f"测试笔记{i}",
"likes": i * 10
}
await store.save_or_update(
"contents",
{"note_id": data["note_id"]},
data
)
results = await store.find_many(
"contents",
{"user_id": test_user_id}
)
self.assertGreaterEqual(len(results), 5)
limited_results = await store.find_many(
"contents",
{"user_id": test_user_id},
limit=3
)
self.assertEqual(len(limited_results), 3)
asyncio.run(test())
def test_real_create_index(self):
async def test():
store = MongoDBStoreBase(collection_prefix="test_xhs")
await store.create_index(
"contents",
[("note_id", 1)],
unique=True
)
collection = await store.get_collection("contents")
indexes = await collection.index_information()
self.assertIn("note_id_1", indexes)
asyncio.run(test())
def test_xhs_store_implementation(self):
async def test():
store = XhsMongoStoreImplement()
note_data = {
"note_id": "xhs_test_001",
"user_id": "user_001",
"nickname": "测试用户",
"title": "小红书测试笔记",
"desc": "这是一条测试笔记",
"type": "normal",
"liked_count": "100",
"collected_count": "50",
"comment_count": "20"
}
await store.store_content(note_data)
comment_data = {
"comment_id": "comment_001",
"note_id": "xhs_test_001",
"user_id": "user_002",
"nickname": "评论用户",
"content": "这是一条测试评论",
"like_count": "10"
}
await store.store_comment(comment_data)
creator_data = {
"user_id": "user_001",
"nickname": "测试创作者",
"desc": "这是一个测试创作者",
"fans": "1000",
"follows": "100"
}
await store.store_creator(creator_data)
mongo_store = store.mongo_store
note = await mongo_store.find_one("contents", {"note_id": "xhs_test_001"})
self.assertIsNotNone(note)
self.assertEqual(note["title"], "小红书测试笔记")
comment = await mongo_store.find_one("comments", {"comment_id": "comment_001"})
self.assertIsNotNone(comment)
self.assertEqual(comment["content"], "这是一条测试评论")
creator = await mongo_store.find_one("creators", {"user_id": "user_001"})
self.assertIsNotNone(creator)
self.assertEqual(creator["nickname"], "测试创作者")
asyncio.run(test())
def test_douyin_store_implementation(self):
async def test():
store = DouyinMongoStoreImplement()
video_data = {
"aweme_id": "dy_test_001",
"user_id": "user_001",
"nickname": "测试用户",
"title": "抖音测试视频",
"desc": "这是一条测试视频",
"liked_count": "1000",
"comment_count": "100"
}
await store.store_content(video_data)
comment_data = {
"comment_id": "dy_comment_001",
"aweme_id": "dy_test_001",
"user_id": "user_002",
"nickname": "评论用户",
"content": "这是一条测试评论"
}
await store.store_comment(comment_data)
creator_data = {
"user_id": "user_001",
"nickname": "测试创作者",
"desc": "这是一个测试创作者"
}
await store.store_creator(creator_data)
mongo_store = store.mongo_store
video = await mongo_store.find_one("contents", {"aweme_id": "dy_test_001"})
self.assertIsNotNone(video)
self.assertEqual(video["title"], "抖音测试视频")
comment = await mongo_store.find_one("comments", {"comment_id": "dy_comment_001"})
self.assertIsNotNone(comment)
creator = await mongo_store.find_one("creators", {"user_id": "user_001"})
self.assertIsNotNone(creator)
asyncio.run(test())
def test_concurrent_operations(self):
async def test():
store = MongoDBStoreBase(collection_prefix="test_xhs")
tasks = []
for i in range(10):
data = {
"note_id": f"concurrent_note_{i:03d}",
"title": f"并发测试笔记{i}",
"content": f"内容{i}"
}
task = store.save_or_update(
"contents",
{"note_id": data["note_id"]},
data
)
tasks.append(task)
results = await asyncio.gather(*tasks)
self.assertTrue(all(results))
for i in range(10):
found = await store.find_one(
"contents",
{"note_id": f"concurrent_note_{i:03d}"}
)
self.assertIsNotNone(found)
asyncio.run(test())
def run_integration_tests():
loader = unittest.TestLoader()
suite = unittest.TestSuite()
suite.addTests(loader.loadTestsFromTestCase(TestMongoDBRealConnection))
runner = unittest.TextTestRunner(verbosity=2)
result = runner.run(suite)
return result
if __name__ == "__main__":
print("="*70)
print("MongoDB存储集成测试")
print("="*70)
print(f"MongoDB配置:")
print(f" Host: {db_config.MONGODB_HOST}")
print(f" Port: {db_config.MONGODB_PORT}")
print(f" Database: {db_config.MONGODB_DB_NAME}")
print("="*70)
result = run_integration_tests()
print("\n" + "="*70)
print("测试统计:")
print(f"总测试数: {result.testsRun}")
print(f"成功: {result.testsRun - len(result.failures) - len(result.errors)}")
print(f"失败: {len(result.failures)}")
print(f"错误: {len(result.errors)}")
print(f"跳过: {len(result.skipped)}")
print("="*70)
sys.exit(0 if result.wasSuccessful() else 1)
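The suite reads its connection settings from the same `config.db_config` values it prints above. A hypothetical runner; the module location and discovery pattern are assumptions, since file paths are not shown in this diff:

```python
import os
import unittest

# Point the tests at a reachable MongoDB instance before config.db_config is imported.
os.environ.setdefault("MONGODB_HOST", "localhost")
os.environ.setdefault("MONGODB_PORT", "27017")
os.environ.setdefault("MONGODB_DB_NAME", "media_crawler")

# Assumed location of the integration test module; adjust to the real path.
suite = unittest.defaultTestLoader.discover("test", pattern="test_mongodb*.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```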

View File

@@ -5,13 +5,16 @@ import os
 import pathlib
 from typing import Dict, List

 import aiofiles
+import config
 from tools.utils import utils
+from tools.words import AsyncWordCloudGenerator

 class AsyncFileWriter:
     def __init__(self, platform: str, crawler_type: str):
         self.lock = asyncio.Lock()
         self.platform = platform
         self.crawler_type = crawler_type
+        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None

     def _get_file_path(self, file_type: str, item_type: str) -> str:
         base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@ class AsyncFileWriter:
             existing_data.append(item)
         async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
             await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
async def generate_wordcloud_from_comments(self):
"""
Generate wordcloud from comments data
Only works when ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS are True
"""
if not config.ENABLE_GET_WORDCLOUD or not config.ENABLE_GET_COMMENTS:
return
if not self.wordcloud_generator:
return
try:
# Read comments from JSON file
comments_file_path = self._get_file_path('json', 'comments')
if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
return
async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
content = await f.read()
if not content:
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
return
comments_data = json.loads(content)
if not isinstance(comments_data, list):
comments_data = [comments_data]
# Filter comments data to only include 'content' field
# Handle different comment data structures across platforms
filtered_data = []
for comment in comments_data:
if isinstance(comment, dict):
# Try different possible content field names
content_text = comment.get('content') or comment.get('comment_text') or comment.get('text') or ''
if content_text:
filtered_data.append({'content': content_text})
if not filtered_data:
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
return
# Generate wordcloud
words_base_path = f"data/{self.platform}/words"
pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")
except Exception as e:
utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")

View File

@@ -127,23 +127,24 @@ class BrowserLauncher:
"--disable-hang-monitor", "--disable-hang-monitor",
"--disable-prompt-on-repost", "--disable-prompt-on-repost",
"--disable-sync", "--disable-sync",
"--disable-web-security", # 可能有助于某些网站的访问
"--disable-features=VizDisplayCompositor",
"--disable-dev-shm-usage", # 避免共享内存问题 "--disable-dev-shm-usage", # 避免共享内存问题
"--no-sandbox", # 在CDP模式下关闭沙箱 "--no-sandbox", # 在CDP模式下关闭沙箱
# 🔥 关键反检测参数
"--disable-blink-features=AutomationControlled", # 禁用自动化控制标记
"--exclude-switches=enable-automation", # 排除自动化开关
"--disable-infobars", # 禁用信息栏
] ]
# 无头模式 # 无头模式
if headless: if headless:
args.extend([ args.extend([
"--headless", "--headless=new", # 使用新的headless模式
"--disable-gpu", "--disable-gpu",
]) ])
else: else:
# 非无头模式下也保持一些稳定性参数 # 非无头模式的额外参数
args.extend([ args.extend([
"--disable-blink-features=AutomationControlled", "--start-maximized", # 最大化窗口,更像真实用户
"--disable-infobars",
]) ])
# 用户数据目录 # 用户数据目录
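Outside the launcher, the net effect of the new flag set can be reproduced with a plain subprocess call; a sketch with an assumed Chrome path (the project resolves the executable, user data directory and debug port itself):

```python
import subprocess

chrome_path = "/usr/bin/google-chrome"  # placeholder; BrowserLauncher detects the real path
args = [
    chrome_path,
    "--disable-blink-features=AutomationControlled",  # hide the automation marker
    "--exclude-switches=enable-automation",
    "--disable-infobars",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--headless=new",   # drop this pair to watch the browser window
    "--disable-gpu",
]
process = subprocess.Popen(args)  # terminated later by the launcher's cleanup
```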

View File

@@ -13,6 +13,8 @@ import os
 import asyncio
 import socket
 import httpx
+import signal
+import atexit
 from typing import Optional, Dict, Any

 from playwright.async_api import Browser, BrowserContext, Playwright
@@ -31,6 +33,40 @@ class CDPBrowserManager:
         self.browser: Optional[Browser] = None
         self.browser_context: Optional[BrowserContext] = None
         self.debug_port: Optional[int] = None
self._cleanup_registered = False
def _register_cleanup_handlers(self):
"""
注册清理处理器,确保程序退出时清理浏览器进程
"""
if self._cleanup_registered:
return
def sync_cleanup():
"""同步清理函数用于atexit"""
if self.launcher and self.launcher.browser_process:
utils.logger.info("[CDPBrowserManager] atexit: 清理浏览器进程")
self.launcher.cleanup()
# 注册atexit清理
atexit.register(sync_cleanup)
# 注册信号处理器
def signal_handler(signum, frame):
"""信号处理器"""
utils.logger.info(f"[CDPBrowserManager] 收到信号 {signum},清理浏览器进程")
if self.launcher and self.launcher.browser_process:
self.launcher.cleanup()
# 重新引发KeyboardInterrupt以便正常退出流程
if signum == signal.SIGINT:
raise KeyboardInterrupt
# 注册SIGINT (Ctrl+C) 和 SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
self._cleanup_registered = True
utils.logger.info("[CDPBrowserManager] 清理处理器已注册")
     async def launch_and_connect(
         self,
@@ -52,7 +88,10 @@ class CDPBrowserManager:
             # 3. 启动浏览器
             await self._launch_browser(browser_path, headless)

-            # 4. 通过CDP连接
+            # 4. 注册清理处理器(确保异常退出时也能清理)
+            self._register_cleanup_handlers()
+            # 5. 通过CDP连接
             await self._connect_via_cdp(playwright)

             # 5. 创建浏览器上下文
@@ -285,38 +324,67 @@ class CDPBrowserManager:
                 return []
         return []

-    async def cleanup(self):
+    async def cleanup(self, force: bool = False):
         """
         清理资源
+        Args:
+            force: 是否强制清理浏览器进程忽略AUTO_CLOSE_BROWSER配置
         """
         try:
             # 关闭浏览器上下文
             if self.browser_context:
                 try:
-                    await self.browser_context.close()
-                    utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
+                    # 检查上下文是否已经关闭
+                    # 尝试获取页面列表,如果失败说明已经关闭
+                    try:
+                        pages = self.browser_context.pages
+                        if pages is not None:
+                            await self.browser_context.close()
+                            utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
+                    except:
+                        utils.logger.debug("[CDPBrowserManager] 浏览器上下文已经被关闭")
                 except Exception as context_error:
-                    utils.logger.warning(
-                        f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
-                    )
+                    # 只在错误不是因为已关闭时才记录警告
+                    error_msg = str(context_error).lower()
+                    if "closed" not in error_msg and "disconnected" not in error_msg:
+                        utils.logger.warning(
+                            f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
+                        )
+                    else:
+                        utils.logger.debug(f"[CDPBrowserManager] 浏览器上下文已关闭: {context_error}")
                 finally:
                     self.browser_context = None

             # 断开浏览器连接
             if self.browser:
                 try:
-                    await self.browser.close()
-                    utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
+                    # 检查浏览器是否仍然连接
+                    if self.browser.is_connected():
+                        await self.browser.close()
+                        utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
+                    else:
+                        utils.logger.debug("[CDPBrowserManager] 浏览器连接已经断开")
                 except Exception as browser_error:
-                    utils.logger.warning(
-                        f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
-                    )
+                    # 只在错误不是因为已关闭时才记录警告
+                    error_msg = str(browser_error).lower()
+                    if "closed" not in error_msg and "disconnected" not in error_msg:
+                        utils.logger.warning(
+                            f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
+                        )
+                    else:
+                        utils.logger.debug(f"[CDPBrowserManager] 浏览器连接已关闭: {browser_error}")
                 finally:
                     self.browser = None

-            # 关闭浏览器进程(如果配置为自动关闭)
-            if config.AUTO_CLOSE_BROWSER:
-                self.launcher.cleanup()
+            # 关闭浏览器进程
+            # force=True 时强制关闭,忽略AUTO_CLOSE_BROWSER配置
+            # 这用于处理异常退出或手动清理的情况
+            if force or config.AUTO_CLOSE_BROWSER:
+                if self.launcher and self.launcher.browser_process:
+                    self.launcher.cleanup()
+                else:
+                    utils.logger.debug("[CDPBrowserManager] 没有需要清理的浏览器进程")
             else:
                 utils.logger.info(
                     "[CDPBrowserManager] 浏览器进程保持运行AUTO_CLOSE_BROWSER=False"

View File

@@ -120,14 +120,7 @@ def get_user_agent() -> str:
 def get_mobile_user_agent() -> str:
     ua_list = [
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.99 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.124 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/110.0.5481.154 Mobile Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
-        "Mozilla/5.0 (Linux; Android 10; JNY-LX1; HMSCore 6.11.0.302) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 HuaweiBrowser/13.0.5.303 Mobile Safari/537.36"
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Mobile/15E148 Safari/604.1"
     ]
     return random.choice(ua_list)

View File

@@ -26,6 +26,10 @@ def init_loging_config():
     )
     _logger = logging.getLogger("MediaCrawler")
     _logger.setLevel(level)
+    # 关闭 httpx 的 INFO 日志
+    logging.getLogger("httpx").setLevel(logging.WARNING)
     return _logger

uv.lock generated
View File

@@ -349,6 +349,15 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30" }, { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30" },
] ]
[[package]]
name = "dnspython"
version = "2.8.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/8b/57666417c0f90f08bcafa776861060426765fdb422eb10212086fb811d26/dnspython-2.8.0.tar.gz", hash = "sha256:181d3c6996452cb1189c4046c61599b84a5a86e099562ffde77d26984ff26d0f", size = 368251 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094 },
]
 [[package]]
 name = "fastapi"
 version = "0.110.2"
@@ -730,6 +739,7 @@ dependencies = [
{ name = "httpx" }, { name = "httpx" },
{ name = "jieba" }, { name = "jieba" },
{ name = "matplotlib" }, { name = "matplotlib" },
{ name = "motor" },
{ name = "opencv-python" }, { name = "opencv-python" },
{ name = "pandas" }, { name = "pandas" },
{ name = "parsel" }, { name = "parsel" },
@@ -746,6 +756,7 @@ dependencies = [
{ name = "typer" }, { name = "typer" },
{ name = "uvicorn" }, { name = "uvicorn" },
{ name = "wordcloud" }, { name = "wordcloud" },
{ name = "xhshow" },
] ]
[package.metadata] [package.metadata]
@@ -760,6 +771,7 @@ requires-dist = [
{ name = "httpx", specifier = "==0.28.1" }, { name = "httpx", specifier = "==0.28.1" },
{ name = "jieba", specifier = "==0.42.1" }, { name = "jieba", specifier = "==0.42.1" },
{ name = "matplotlib", specifier = "==3.9.0" }, { name = "matplotlib", specifier = "==3.9.0" },
{ name = "motor", specifier = ">=3.3.0" },
{ name = "opencv-python", specifier = ">=4.11.0.86" }, { name = "opencv-python", specifier = ">=4.11.0.86" },
{ name = "pandas", specifier = "==2.2.3" }, { name = "pandas", specifier = "==2.2.3" },
{ name = "parsel", specifier = "==1.9.1" }, { name = "parsel", specifier = "==1.9.1" },
@@ -776,6 +788,19 @@ requires-dist = [
{ name = "typer", specifier = ">=0.12.3" }, { name = "typer", specifier = ">=0.12.3" },
{ name = "uvicorn", specifier = "==0.29.0" }, { name = "uvicorn", specifier = "==0.29.0" },
{ name = "wordcloud", specifier = "==1.9.3" }, { name = "wordcloud", specifier = "==1.9.3" },
{ name = "xhshow", specifier = ">=0.1.3" },
]
[[package]]
name = "motor"
version = "3.7.1"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "pymongo" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/93/ae/96b88362d6a84cb372f7977750ac2a8aed7b2053eed260615df08d5c84f4/motor-3.7.1.tar.gz", hash = "sha256:27b4d46625c87928f331a6ca9d7c51c2f518ba0e270939d395bc1ddc89d64526", size = 280997 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/01/9a/35e053d4f442addf751ed20e0e922476508ee580786546d699b0567c4c67/motor-3.7.1-py3-none-any.whl", hash = "sha256:8a63b9049e38eeeb56b4fdd57c3312a6d1f25d01db717fe7d82222393c410298", size = 74996 },
]
[[package]]
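The motor package added above is the asyncio MongoDB driver (a thin async wrapper over pymongo) that the new MongoDB storage option depends on. A minimal connection sketch, assuming a local unauthenticated MongoDB instance and placeholder database/collection names:

import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

async def main() -> None:
    # Host, port, database and collection names are placeholders for illustration
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    collection = client["media_crawler"]["demo_notes"]
    await collection.insert_one({"note_id": "demo", "title": "hello"})
    print(await collection.find_one({"note_id": "demo"}))

if __name__ == "__main__":
    asyncio.run(main())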
@@ -1058,6 +1083,67 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/9e/11/a1938340ecb32d71e47ad4914843775011e6e9da59ba1229f181fef3119e/pyhumps-3.8.0-py3-none-any.whl", hash = "sha256:060e1954d9069f428232a1adda165db0b9d8dfdce1d265d36df7fbff540acfd6", size = 6095 }, { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9e/11/a1938340ecb32d71e47ad4914843775011e6e9da59ba1229f181fef3119e/pyhumps-3.8.0-py3-none-any.whl", hash = "sha256:060e1954d9069f428232a1adda165db0b9d8dfdce1d265d36df7fbff540acfd6", size = 6095 },
] ]
[[package]]
name = "pymongo"
version = "4.15.3"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "dnspython" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9d/7b/a709c85dc716eb85b69f71a4bb375cf1e72758a7e872103f27551243319c/pymongo-4.15.3.tar.gz", hash = "sha256:7a981271347623b5319932796690c2d301668ac3a1965974ac9f5c3b8a22cea5", size = 2470801 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/73/04/3dbc426c5868961d8308f19750243f8472f587f5f8a5029ce6953ba74b82/pymongo-4.15.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a13d8f7141294404ce46dfbabb2f2d17e9b1192456651ae831fa351f86fbeb", size = 865889 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/39/7f7652f53dd0eb0c4c3420a175183da757e9c53f9a2bf3ebc589758a1b9e/pymongo-4.15.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:17d13458baf4a6a9f2e787d95adf8ec50d412accb9926a044bd1c41029c323b2", size = 866230 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/6a/0b/84e119e6bab7b19cf4fa1ebb9b4c29bf6c0e76521ed8221b44e3f94a3a37/pymongo-4.15.3-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:fe4bcb8acfb288e238190397d4a699aeb4adb70e8545a6f4e44f99d4e8096ab1", size = 1429788 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/30/39/9905fcb99903de6ac8483114d1c85efe56bc5df735857bdfcc372cf8a3ec/pymongo-4.15.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d09d895c7f08bcbed4d2e96a00e52e9e545ae5a37b32d2dc10099b205a21fc6d", size = 1456758 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/08/58/3c3ac32b8d6ebb654083d53f58e4621cd4c7f306b3b85acef667b80acf08/pymongo-4.15.3-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:21c0a95a4db72562fd0805e2f76496bf432ba2e27a5651f4b9c670466260c258", size = 1514666 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/19/e2/52f41de224218dc787b7e1187a1ca1a51946dcb979ee553ec917745ccd8d/pymongo-4.15.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:89e45d7fa987f4e246cdf43ff001e3f911f73eb19ba9dabc2a6d80df5c97883b", size = 1500703 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/0d/a5271073339ba6fc8a5f4e3a62baaa5dd8bf35246c37b512317e2a22848e/pymongo-4.15.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1246a82fa6dd73ac2c63aa7e463752d5d1ca91e0c7a23396b78f21273befd3a7", size = 1452013 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/a0/3b/f39b721ca0db9f0820e12eeffec84eb87b7502abb13a685226c5434f9618/pymongo-4.15.3-cp311-cp311-win32.whl", hash = "sha256:9483521c03f6017336f54445652ead3145154e8d3ea06418e52cea57fee43292", size = 844461 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/72/e58b9df862edbf238a1d71fa32749a6eaf30a3f60289602681351c29093a/pymongo-4.15.3-cp311-cp311-win_amd64.whl", hash = "sha256:c57dad9f289d72af1d7c47a444c4d9fa401f951cedbbcc54c7dd0c2107d6d786", size = 859200 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/81/8f/64c15df5e87de759412c3b962950561202c9b39e5cc604061e056043e163/pymongo-4.15.3-cp311-cp311-win_arm64.whl", hash = "sha256:2fd3b99520f2bb013960ac29dece1b43f2f1b6d94351ca33ba1b1211ecf79a09", size = 848372 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/5b/92/7491a2046b41bfd3641da0a23529c88e27eac67c681de3cd9fbef4113d38/pymongo-4.15.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bd0497c564b0ae34fb816464ffc09986dd9ca29e2772a0f7af989e472fecc2ad", size = 920953 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ce/0c/98864cbfa8fbc954ae7480c91a35f0dc4e3339dab0c55f669e4dbeac808f/pymongo-4.15.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:292fd5a3f045751a823a54cdea75809b2216a62cc5f74a1a96b337db613d46a8", size = 920690 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/b8/a6/7dc8043a10a1c30153be2d6847ab37911b169d53a6b05d21871b35b3de82/pymongo-4.15.3-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:959ef69c5e687b6b749fbf2140c7062abdb4804df013ae0507caabf30cba6875", size = 1690357 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/96/3d85da60094d2022217f2849e1b61a79af9d51ed8d05455d7413d68ab88e/pymongo-4.15.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de3bc878c3be54ae41c2cabc9e9407549ed4fec41f4e279c04e840dddd7c630c", size = 1726102 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/fd/dfd6ddee0330171f2f52f7e5344c02d25d2dd8dfa95ce0e5e413579f52fd/pymongo-4.15.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:07bcc36d11252f24fe671e7e64044d39a13d997b0502c6401161f28cc144f584", size = 1800630 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/1c/3b/e19a5f2de227ff720bc76c41d166d508e6fbe1096ba1ad18ade43b790b5e/pymongo-4.15.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b63bac343b79bd209e830aac1f5d9d552ff415f23a924d3e51abbe3041265436", size = 1785478 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/d2/927c9b1383c6708fc50c3700ecb1c2876e67dde95ad5fb1d29d04e8ac083/pymongo-4.15.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b33d59bf6fa1ca1d7d96d4fccff51e41312358194190d53ef70a84c070f5287e", size = 1718548 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/fe/10/a63592d1445f894b18d04865c2d4c235e2261f3d63f31f45ba4fe0486ec4/pymongo-4.15.3-cp312-cp312-win32.whl", hash = "sha256:b3a0ec660d61efb91c16a5962ec937011fe3572c4338216831f102e53d294e5c", size = 891301 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/be/ba/a8fdc43044408ed769c83108fa569aa52ee87968bdbf1e2ea142b109c268/pymongo-4.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:f6b0513e5765fdde39f36e6a29a36c67071122b5efa748940ae51075beb5e4bc", size = 910928 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/61/d53c17fdfaa9149864ab1fa84436ae218b72c969f00e4c124e017e461ce6/pymongo-4.15.3-cp312-cp312-win_arm64.whl", hash = "sha256:c4fdd8e6eab8ff77c1c8041792b5f760d48508623cd10b50d5639e73f1eec049", size = 896347 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/46/a4/e1ce9d408a1c1bcb1554ff61251b108e16cefd7db91b33faa2afc92294de/pymongo-4.15.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a47a3218f7900f65bf0f36fcd1f2485af4945757360e7e143525db9d715d2010", size = 975329 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/3c/6796f653d22be43cc0b13c07dbed84133eebbc334ebed4426459b7250163/pymongo-4.15.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:09440e78dff397b2f34a624f445ac8eb44c9756a2688b85b3bf344d351d198e1", size = 975129 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/88/33/22453dbfe11031e89c9cbdfde6405c03960daaf5da1b4dfdd458891846b5/pymongo-4.15.3-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:97f9babdb98c31676f97d468f7fe2dc49b8a66fb6900effddc4904c1450196c8", size = 1950979 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/07/094598e403112e2410a3376fb7845c69e2ec2dfc5ab5cc00b29dc2d26559/pymongo-4.15.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:71413cd8f091ae25b1fec3af7c2e531cf9bdb88ce4079470e64835f6a664282a", size = 1995271 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/47/9a/29e44f3dee68defc56e50ed7c9d3802ebf967ab81fefb175d8d729c0f276/pymongo-4.15.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:76a8d4de8dceb69f6e06736198ff6f7e1149515ef946f192ff2594d2cc98fc53", size = 2086587 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ff/d5/e9ff16aa57f671349134475b904fd431e7b86e152b01a949aef4f254b2d5/pymongo-4.15.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:77353978be9fc9e5fe56369682efed0aac5f92a2a1570704d62b62a3c9e1a24f", size = 2070201 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/d6/a3/820772c0b2bbb671f253cfb0bede4cf694a38fb38134f3993d491e23ec11/pymongo-4.15.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9897a837677e3814873d0572f7e5d53c23ce18e274f3b5b87f05fb6eea22615b", size = 1985260 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/6e/7b/365ac821aefad7e8d36a4bc472a94429449aade1ccb7805d9ca754df5081/pymongo-4.15.3-cp313-cp313-win32.whl", hash = "sha256:d66da207ccb0d68c5792eaaac984a0d9c6c8ec609c6bcfa11193a35200dc5992", size = 938122 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/80/f3/5ca27e1765fa698c677771a1c0e042ef193e207c15f5d32a21fa5b13d8c3/pymongo-4.15.3-cp313-cp313-win_amd64.whl", hash = "sha256:52f40c4b8c00bc53d4e357fe0de13d031c4cddb5d201e1a027db437e8d2887f8", size = 962610 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/48/7c/42f0b6997324023e94939f8f32b9a8dd928499f4b5d7b4412905368686b5/pymongo-4.15.3-cp313-cp313-win_arm64.whl", hash = "sha256:fb384623ece34db78d445dd578a52d28b74e8319f4d9535fbaff79d0eae82b3d", size = 944300 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/a3/d8aaf9c243ce1319bd2498004a9acccfcfb35a3ef9851abb856993d95255/pymongo-4.15.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:dcff15b9157c16bc796765d4d3d151df669322acfb0357e4c3ccd056153f0ff4", size = 1029873 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/10/91fd7791425ed3b56cbece6c23a36fb2696706a695655d8ea829e5e23c3a/pymongo-4.15.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1f681722c9f27e86c49c2e8a838e61b6ecf2285945fd1798bd01458134257834", size = 1029611 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/bb/9c/d9cf8d8a181f96877bca7bdec3e6ce135879d5e3d78694ea465833c53a3f/pymongo-4.15.3-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2c96dde79bdccd167b930a709875b0cd4321ac32641a490aebfa10bdcd0aa99b", size = 2211827 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c2/40/12703964305216c155284100124222eaa955300a07d426c6e0ba3c9cbade/pymongo-4.15.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d2d4ca446348d850ac4a5c3dc603485640ae2e7805dbb90765c3ba7d79129b37", size = 2264654 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/70/bf3c18b5d0cae0b9714158b210b07b5891a875eb1c503271cfe045942fd3/pymongo-4.15.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7c0fd3de3a12ff0a8113a3f64cedb01f87397ab8eaaffa88d7f18ca66cd39385", size = 2371830 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/21/6d/2dfaed2ae66304ab842d56ed9a1bd2706ca0ecf97975b328a5eeceb2a4c0/pymongo-4.15.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:e84dec392cf5f72d365e0aac73f627b0a3170193ebb038c3f7e7df11b7983ee7", size = 2351878 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/ed/fe46ff9adfa6dc11ad2e0694503adfc98f40583cfcc6db4dbaf582f0e357/pymongo-4.15.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8d4b01a48369ea6d5bc83fea535f56279f806aa3e4991189f0477696dd736289", size = 2251356 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/c4/2e1a10b1e9bca9c106f2dc1b89d4ad70c63d387c194b3a1bfcca552b5a3f/pymongo-4.15.3-cp314-cp314-win32.whl", hash = "sha256:3561fa96c3123275ec5ccf919e595547e100c412ec0894e954aa0da93ecfdb9e", size = 992878 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/98/b5/14aa417a44ea86d4c31de83b26f6e6793f736cd60e7e7fda289ce5184bdf/pymongo-4.15.3-cp314-cp314-win_amd64.whl", hash = "sha256:9df2db6bd91b07400879b6ec89827004c0c2b55fc606bb62db93cafb7677c340", size = 1021209 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/9f/1097c6824fa50a4ffb11ba5194d2a9ef68d5509dd342e32ddb697d2efe4e/pymongo-4.15.3-cp314-cp314-win_arm64.whl", hash = "sha256:ff99864085d2c7f4bb672c7167680ceb7d273e9a93c1a8074c986a36dbb71cc6", size = 1000618 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ad/31/37c76607a4f793f4491611741fa7a7c4238b956f48c4a9505cea0b5cf7ef/pymongo-4.15.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:ffe217d2502f3fba4e2b0dc015ce3b34f157b66dfe96835aa64432e909dd0d95", size = 1086576 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/92/b2/6d17d279cdd293eeeb0c9d5baeb4f8cdebb45354fd81cfcef2d1c69303ab/pymongo-4.15.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:390c4954c774eda280898e73aea36482bf20cba3ecb958dbb86d6a68b9ecdd68", size = 1086656 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/55/fd/c5da8619beca207d7e6231f24ed269cb537c5311dad59fd9f2ef7d43204a/pymongo-4.15.3-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7dd2a49f088890ca08930bbf96121443b48e26b02b84ba0a3e1ae2bf2c5a9b48", size = 2531646 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/93/8f/66a7e12b874f41eb205f352b3a719e5a964b5ba103996f6ac45e80560111/pymongo-4.15.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f6feb678f26171f2a6b2cbb340949889154c7067972bd4cc129b62161474f08", size = 2603799 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/10/98/baf0d1f8016087500899cc4ae14e591f29b016c643e99ab332fcafe6f7bc/pymongo-4.15.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:446417a34ff6c2411ce3809e17ce9a67269c9f1cb4966b01e49e0c590cc3c6b3", size = 2725238 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c9/a2/112d8d3882d6e842f501e166fbe08dfc2bc9a35f8773cbcaa804f7991043/pymongo-4.15.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:cfa4a0a0f024a0336640e1201994e780a17bda5e6a7c0b4d23841eb9152e868b", size = 2704837 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/38/fe/043a9aac7b3fba5b8e216f48359bd18fdbe46a4d93b081786f773b25e997/pymongo-4.15.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b03db2fe37c950aff94b29ded5c349b23729bccd90a0a5907bbf807d8c77298", size = 2582294 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/5b/fe/7a6a6b331d9f2024ab171028ab53d5d9026959b1d713fe170be591a4d9a8/pymongo-4.15.3-cp314-cp314t-win32.whl", hash = "sha256:e7cde58ef6470c0da922b65e885fb1ffe04deef81e526bd5dea429290fa358ca", size = 1043993 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/70/c8/bc64321711e19bd48ea3371f0082f10295c433833245d73e7606d3b9afbe/pymongo-4.15.3-cp314-cp314t-win_amd64.whl", hash = "sha256:fae552767d8e5153ed498f1bca92d905d0d46311d831eefb0f06de38f7695c95", size = 1078481 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/39/31/2bb2003bb978eb25dfef7b5f98e1c2d4a86e973e63b367cc508a9308d31c/pymongo-4.15.3-cp314-cp314t-win_arm64.whl", hash = "sha256:47ffb068e16ae5e43580d5c4e3b9437f05414ea80c32a1e5cac44a835859c259", size = 1051179 },
]
[[package]]
name = "pymysql"
version = "1.1.1"
@@ -1323,3 +1409,12 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/77/c0/bc14fd7fa96e5b544aac4e9e65b5dd6f753d72184da35e35eb0b24c4dde4/wordcloud-1.9.3-cp312-cp312-win32.whl", hash = "sha256:419acfe0b1d1227b9e3e14ec1bb6c40fd7fa652df4adf81f0ba3e00daca500b5", size = 291251 }, { url = "https://pypi.tuna.tsinghua.edu.cn/packages/77/c0/bc14fd7fa96e5b544aac4e9e65b5dd6f753d72184da35e35eb0b24c4dde4/wordcloud-1.9.3-cp312-cp312-win32.whl", hash = "sha256:419acfe0b1d1227b9e3e14ec1bb6c40fd7fa652df4adf81f0ba3e00daca500b5", size = 291251 },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/a0/b8fa5f2d7147a7675e2cab99108f7d8d524b67481f81f289cdb2b64ed1ab/wordcloud-1.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:2061a9978a6243107ce1a8a9fa24f421b03a0f7e620769b6f5075857e75aa615", size = 301393 }, { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/a0/b8fa5f2d7147a7675e2cab99108f7d8d524b67481f81f289cdb2b64ed1ab/wordcloud-1.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:2061a9978a6243107ce1a8a9fa24f421b03a0f7e620769b6f5075857e75aa615", size = 301393 },
] ]
[[package]]
name = "xhshow"
version = "0.1.3"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/37/5b/0f80000832e91fed4c9f0baae8fcf291c61d7f624d4cff458749fd34a603/xhshow-0.1.3.tar.gz", hash = "sha256:ae1c7afd32c87aaacbc39de68db4bfc8bd4fea9a9bb2e2831e71ccac5d58b08d", size = 28965 }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/4a/89/dfea5736fb564c3941fe67014dac4995af8f67b9bdb315cf700647a5d645/xhshow-0.1.3-py3-none-any.whl", hash = "sha256:4705f7158dad276022af81fb9c1387dfe68338ab210a87ab6adc5235996bd70f", size = 16764 },
]