From 6dcfd7e0a5b2f1f9bdf215bd5077e15098ca2040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Mon, 17 Nov 2025 17:11:35 +0800 Subject: [PATCH] refactor: weibo login --- config/weibo_config.py | 2 +- media_platform/weibo/client.py | 27 ++++++++++++++++++++++++--- media_platform/weibo/core.py | 11 ++++++++--- tools/crawler_util.py | 9 +-------- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/config/weibo_config.py b/config/weibo_config.py index a8224ad..f89ea35 100644 --- a/config/weibo_config.py +++ b/config/weibo_config.py @@ -12,7 +12,7 @@ # 微博平台配置 # 搜索类型,具体的枚举值在media_platform/weibo/field.py中 -WEIBO_SEARCH_TYPE = "popular" +WEIBO_SEARCH_TYPE = "default" # 指定微博ID列表 WEIBO_SPECIFIED_ID_LIST = [ diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index ec9f289..0f89993 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -23,6 +23,7 @@ from urllib.parse import parse_qs, unquote, urlencode import httpx from httpx import Response from playwright.async_api import BrowserContext, Page +from tenacity import retry, stop_after_attempt, wait_fixed import config from tools import utils @@ -50,6 +51,7 @@ class WeiboClient: self.cookie_dict = cookie_dict self._image_agent_host = "https://i1.wp.com/" + @retry(stop=stop_after_attempt(5), wait=wait_fixed(2)) async def request(self, method, url, **kwargs) -> Union[Response, Dict]: enable_return_response = kwargs.pop("return_response", False) async with httpx.AsyncClient(proxy=self.proxy) as client: @@ -58,7 +60,12 @@ class WeiboClient: if enable_return_response: return response - data: Dict = response.json() + try: + data: Dict = response.json() + except json.decoder.JSONDecodeError: + utils.logger.error(f"[WeiboClient.request] request {method}:{url} err code: {response.status_code} res:{response.text}") + raise DataFetchError(f"get response code error: {response.status_code}") + ok_code = data.get("ok") if ok_code == 0: # response error utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}") @@ -99,10 +106,24 @@ class WeiboClient: ping_flag = False return ping_flag - async def update_cookies(self, browser_context: BrowserContext): - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + async def update_cookies(self, browser_context: BrowserContext, urls: Optional[List[str]] = None): + """ + Update cookies from browser context + :param browser_context: Browser context + :param urls: Optional list of URLs to filter cookies (e.g., ["https://m.weibo.cn"]) + If provided, only cookies for these URLs will be retrieved + """ + if urls: + cookies = await browser_context.cookies(urls=urls) + utils.logger.info(f"[WeiboClient.update_cookies] Updating cookies for specific URLs: {urls}") + else: + cookies = await browser_context.cookies() + utils.logger.info("[WeiboClient.update_cookies] Updating all cookies") + + cookie_str, cookie_dict = utils.convert_cookies(cookies) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict + utils.logger.info(f"[WeiboClient.update_cookies] Cookie updated successfully, total: {len(cookie_dict)} cookies") async def get_note_by_keyword( self, diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 2b1ac9f..6c05b4e 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -83,7 +83,8 @@ class WeiboCrawler(AbstractCrawler): self.context_page = await self.browser_context.new_page() - await self.context_page.goto(self.mobile_index_url) + await self.context_page.goto(self.index_url) + await asyncio.sleep(2) # Create a client to interact with the xiaohongshu website. self.wb_client = await self.create_weibo_client(httpx_proxy_format) @@ -100,8 +101,12 @@ class WeiboCrawler(AbstractCrawler): # 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform") await self.context_page.goto(self.mobile_index_url) - await asyncio.sleep(2) - await self.wb_client.update_cookies(browser_context=self.browser_context) + await asyncio.sleep(3) + # 只获取移动端的 cookies,避免 PC 端和移动端 cookies 混淆 + await self.wb_client.update_cookies( + browser_context=self.browser_context, + urls=[self.mobile_index_url] + ) crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 06cf8c5..13141e8 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -120,14 +120,7 @@ def get_user_agent() -> str: def get_mobile_user_agent() -> str: ua_list = [ - "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.99 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.124 Mobile/15E148 Safari/604.1", - "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36", - "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/110.0.5481.154 Mobile Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0", - "Mozilla/5.0 (Linux; Android 10; JNY-LX1; HMSCore 6.11.0.302) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 HuaweiBrowser/13.0.5.303 Mobile Safari/537.36" + "Mozilla/5.0 (iPhone; CPU iPhone OS 18_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Mobile/15E148 Safari/604.1" ] return random.choice(ua_list)