mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 03:15:17 +08:00
feat: update xhs sign
This commit is contained in:
@@ -17,7 +17,7 @@ SORT_TYPE = "popularity_descending"
|
||||
|
||||
# 指定笔记URL列表, 必须要携带xsec_token参数
|
||||
XHS_SPECIFIED_NOTE_URL_LIST = [
|
||||
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
|
||||
"https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed"
|
||||
# ........................
|
||||
]
|
||||
|
||||
|
||||
@@ -10,23 +10,24 @@
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
from html import unescape
|
||||
|
||||
|
||||
from .exception import DataFetchError, IPBlockError
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import get_search_id, sign
|
||||
from .extractor import XiaoHongShuExtractor
|
||||
from .secsign import seccore_signv2_playwright
|
||||
|
||||
|
||||
class XiaoHongShuClient(AbstractApiClient):
|
||||
@@ -63,15 +64,13 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||
Returns:
|
||||
|
||||
"""
|
||||
encrypt_params = await self.playwright_page.evaluate(
|
||||
"([url, data]) => window._webmsxyw(url,data)", [url, data]
|
||||
)
|
||||
x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
|
||||
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
|
||||
signs = sign(
|
||||
a1=self.cookie_dict.get("a1", ""),
|
||||
b1=local_storage.get("b1", ""),
|
||||
x_s=encrypt_params.get("X-s", ""),
|
||||
x_t=str(encrypt_params.get("X-t", "")),
|
||||
x_s=x_s,
|
||||
x_t=str(int(time.time())),
|
||||
)
|
||||
|
||||
headers = {
|
||||
|
||||
@@ -282,16 +282,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
async with semaphore:
|
||||
try:
|
||||
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
|
||||
|
||||
try:
|
||||
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
|
||||
except RetryError:
|
||||
pass
|
||||
|
||||
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
|
||||
if not note_detail:
|
||||
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
|
||||
if not note_detail:
|
||||
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
|
||||
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
|
||||
|
||||
note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
|
||||
|
||||
|
||||
@@ -27,16 +27,17 @@ def sign(a1="", b1="", x_s="", x_t=""):
|
||||
"s0": 3, # getPlatformCode
|
||||
"s1": "",
|
||||
"x0": "1", # localStorage.getItem("b1b1")
|
||||
"x1": "3.7.8-2", # version
|
||||
"x1": "4.2.2", # version
|
||||
"x2": "Mac OS",
|
||||
"x3": "xhs-pc-web",
|
||||
"x4": "4.27.2",
|
||||
"x4": "4.74.0",
|
||||
"x5": a1, # cookie of a1
|
||||
"x6": x_t,
|
||||
"x7": x_s,
|
||||
"x8": b1, # localStorage.getItem("b1")
|
||||
"x9": mrc(x_t + x_s + b1),
|
||||
"x10": 154, # getSigCount
|
||||
"x11": "normal"
|
||||
}
|
||||
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
|
||||
x_s_common = b64Encode(encode_str)
|
||||
|
||||
66
media_platform/xhs/secsign.py
Normal file
66
media_platform/xhs/secsign.py
Normal file
@@ -0,0 +1,66 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
||||
# 1. 不得用于任何商业用途。
|
||||
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
||||
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
||||
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
||||
# 5. 不得用于任何非法或不当的用途。
|
||||
#
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
import hashlib
|
||||
import base64
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
def _build_c(e: Any, a: Any) -> str:
|
||||
c = str(e)
|
||||
if isinstance(a, (dict, list)):
|
||||
c += json.dumps(a, separators=(",", ":"), ensure_ascii=False)
|
||||
elif isinstance(a, str):
|
||||
c += a
|
||||
# 其它类型不拼
|
||||
return c
|
||||
|
||||
|
||||
# ---------------------------
|
||||
# p.Pu = MD5(c) => hex 小写
|
||||
# ---------------------------
|
||||
def _md5_hex(s: str) -> str:
|
||||
return hashlib.md5(s.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Playwright 版本(异步):传入 page(Page 对象)
|
||||
# 内部用 page.evaluate('window.mnsv2(...)')
|
||||
# ============================================================
|
||||
async def seccore_signv2_playwright(
    page,  # Playwright Page
    e: Any,
    a: Any,
) -> str:
    """Generate an ``XYS_``-prefixed signature token using ``window.mnsv2``.

    The string to sign is built from ``e`` and ``a`` by ``_build_c``, its MD5
    hex digest is computed by ``_md5_hex``, and both are handed to the
    ``window.mnsv2`` function inside the browser page. The value returned by
    the page is then wrapped, together with ``a`` and a few fixed fields, in
    a compact JSON object that is Base64-encoded and prefixed with ``XYS_``.

    ``window.mnsv2`` must already exist in the page context (for example
    because the target site's scripts have been loaded).

    Args:
        page: Playwright ``Page`` whose JavaScript context exposes
            ``window.mnsv2``.
        e: First component of the string to sign (converted with ``str``).
        a: Payload; appended to the signed string when it is a ``str``,
            ``dict`` or ``list`` and embedded as field ``x4`` of the token.
            It must be JSON-serializable.

    Returns:
        The token string ``"XYS_" + base64(json)``.
    """
    c = _build_c(e, a)
    d = _md5_hex(c)

    # page.evaluate() forwards exactly ONE argument to the page function, so
    # both values are sent as a list and destructured on the JavaScript side.
    # A plain ``(c, d) => ...`` would receive the whole list as ``c`` and
    # ``undefined`` as ``d``.
    s = await page.evaluate("([c, d]) => window.mnsv2(c, d)", [c, d])

    f = {
        "x0": "4.2.6",
        "x1": "xhs-pc-web",
        "x2": "Mac OS",
        "x3": s,
        "x4": a,
    }
    payload = json.dumps(f, separators=(",", ":"), ensure_ascii=False).encode("utf-8")
    # The token is returned only; it is not printed, so signing material
    # does not end up on stdout.
    return "XYS_" + base64.b64encode(payload).decode("ascii")
|
||||
Reference in New Issue
Block a user