fix: xhs sub comment bugfix #769

This commit is contained in:
程序员阿江(Relakkes)
2025-11-17 11:47:33 +08:00
parent b6caa7a85e
commit a1c5e07df8
2 changed files with 55 additions and 39 deletions

View File

@@ -88,8 +88,7 @@ async def main():
def cleanup(): def cleanup():
if crawler: if crawler:
# asyncio.run(crawler.close())
pass pass
if config.SAVE_DATA_OPTION in ["db", "sqlite"]: if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
asyncio.run(db.close()) asyncio.run(db.close())

View File

@@ -12,7 +12,8 @@ import asyncio
import json import json
import time import time
from typing import Any, Callable, Dict, List, Optional, Union from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode from urllib.parse import urlencode, urlparse, parse_qs
import httpx import httpx
from playwright.async_api import BrowserContext, Page from playwright.async_api import BrowserContext, Page
@@ -56,48 +57,49 @@ class XiaoHongShuClient(AbstractApiClient):
# 初始化 xhshow 客户端用于签名生成 # 初始化 xhshow 客户端用于签名生成
self._xhshow_client = Xhshow() self._xhshow_client = Xhshow()
async def _pre_headers(self, url: str, data=None) -> Dict: async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict:
""" """请求头参数签名
请求头参数签名,使用 xhshow 库生成签名
Args: Args:
url: 完整的 URIGET 请求包含查询参数 url: 请求的URL(GET请求包含请求的参数)
data: POST 请求的请求体数据 params: GET请求的参数
payload: POST请求的参数
Returns: Returns:
Dict: 请求头参数签名
""" """
# 获取 a1 cookie 值
a1_value = self.cookie_dict.get("a1", "") a1_value = self.cookie_dict.get("a1", "")
parsed = urlparse(url)
# 根据请求类型使用不同的签名方法 uri = parsed.path
if data is None: if params is not None:
# GET 请求:从 url 中提取参数 x_s = self._xhshow_client.sign_xs_get(
from urllib.parse import urlparse, parse_qs uri=uri, a1_value=a1_value, params=params
parsed = urlparse(url) )
params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()} elif payload is not None:
# 使用完整的 URL包含 host x_s = self._xhshow_client.sign_xs_post(
full_url = f"{self._host}{url}" uri=uri, a1_value=a1_value, payload=payload
x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params) )
else: else:
# POST 请求:使用 data 作为 payload raise ValueError("params or payload is required")
full_url = f"{self._host}{url}"
x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)
# 尝试获取 b1 值(从 localStorage如果获取失败则使用空字符串 # 获取 b1 值
b1_value = "" b1_value = ""
try: try:
if self.playwright_page: if self.playwright_page:
local_storage = await self.playwright_page.evaluate("() => window.localStorage") local_storage = await self.playwright_page.evaluate(
"() => window.localStorage"
)
b1_value = local_storage.get("b1", "") b1_value = local_storage.get("b1", "")
except Exception as e: except Exception as e:
utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string") utils.logger.warning(
f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}"
)
# 使用 sign 函数生成其他签名头
signs = sign( signs = sign(
a1=a1_value, a1=a1_value,
b1=b1_value, b1=b1_value,
x_s=x_s, x_s=x_s,
x_t=str(int(time.time() * 1000)), # x-t 使用毫秒时间戳 x_t=str(int(time.time() * 1000)),
) )
headers = { headers = {
@@ -145,7 +147,7 @@ class XiaoHongShuClient(AbstractApiClient):
err_msg = data.get("msg", None) or f"{response.text}" err_msg = data.get("msg", None) or f"{response.text}"
raise DataFetchError(err_msg) raise DataFetchError(err_msg)
async def get(self, uri: str, params=None) -> Dict: async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
""" """
GET请求对请求头签名 GET请求对请求头签名
Args: Args:
@@ -155,12 +157,18 @@ class XiaoHongShuClient(AbstractApiClient):
Returns: Returns:
""" """
final_uri = uri headers = await self._pre_headers(uri, params)
if isinstance(params, dict): if isinstance(params, dict):
final_uri = f"{uri}?" f"{urlencode(params)}" # 使用 xhsshow build_url 构建完整的 URL
headers = await self._pre_headers(final_uri) full_url = self._xhshow_client.build_url(
base_url=f"{self._host}{uri}",
params=params
)
else:
full_url = f"{self._host}{uri}"
return await self.request( return await self.request(
method="GET", url=f"{self._host}{final_uri}", headers=headers method="GET", url=full_url, headers=headers
) )
async def post(self, uri: str, data: dict, **kwargs) -> Dict: async def post(self, uri: str, data: dict, **kwargs) -> Dict:
@@ -173,8 +181,8 @@ class XiaoHongShuClient(AbstractApiClient):
Returns: Returns:
""" """
headers = await self._pre_headers(uri, data) headers = await self._pre_headers(uri, payload=data)
json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False) json_str = self._xhshow_client.build_json_body(payload=data)
return await self.request( return await self.request(
method="POST", method="POST",
url=f"{self._host}{uri}", url=f"{self._host}{uri}",
@@ -523,8 +531,15 @@ class XiaoHongShuClient(AbstractApiClient):
Returns: Returns:
""" """
uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}" uri = f"/api/sns/web/v1/user_posted"
return await self.get(uri) params = {
"num": page_size,
"cursor": cursor,
"user_id": creator,
"xsec_token": xsec_token,
"xsec_source": xsec_source,
}
return await self.get(uri, params)
async def get_all_notes_by_creator( async def get_all_notes_by_creator(
self, self,
@@ -550,7 +565,9 @@ class XiaoHongShuClient(AbstractApiClient):
notes_has_more = True notes_has_more = True
notes_cursor = "" notes_cursor = ""
while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source) notes_res = await self.get_notes_by_creator(
user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source
)
if not notes_res: if not notes_res:
utils.logger.error( utils.logger.error(
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data." f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."