Mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2025-11-25 03:15:17 +08:00)

Compare commits: 54f23b8d1c ... b6caa7a85e (3 commits)

- b6caa7a85e
- 1e3637f238
- b5dab6d1e8
@@ -314,6 +314,7 @@ Nstbrowser fingerprint browser — the best solution for multi-account operation & automated management

## 📚 References

- **Xiaohongshu signature repo**: [Cloxl's xhs signature repo](https://github.com/Cloxl/xhshow)
- **Xiaohongshu client**: [ReaJason's xhs repo](https://github.com/ReaJason/xhs)
- **SMS forwarding**: [SmsForwarder reference repo](https://github.com/pppscn/SmsForwarder)
- **Intranet tunneling tool**: [ngrok official docs](https://ngrok.com/docs/)
@@ -17,16 +17,13 @@ SORT_TYPE = "popularity_descending"

# List of specified note URLs; each URL must carry the xsec_token parameter
XHS_SPECIFIED_NOTE_URL_LIST = [
"https://www.xiaohongshu.com/explore/68f99f6d0000000007033fcf?xsec_token=ABZEzjuN2fPjKF9EcMsCCxfbt3IBRsFZldGFoCJbdDmXI=&xsec_source=pc_feed",
"https://www.xiaohongshu.com/explore/64b95d01000000000c034587?xsec_token=AB0EFqJvINCkj6xOCKCQgfNNh8GdnBC_6XecG4QOddo3Q=&xsec_source=pc_cfeed"
# ........................
]

# List of specified creator URLs (full URLs or plain IDs are supported)
# Supported formats:
# 1. Full creator profile URL (with xsec_token and xsec_source parameters): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
# 2. Plain user_id: "63e36c9a000000002703502b"
# List of specified creator URLs; must carry the xsec_token and xsec_source parameters

XHS_CREATOR_ID_LIST = [
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
"63e36c9a000000002703502b",
"https://www.xiaohongshu.com/user/profile/5f58bd990000000001003753?xsec_token=ABYVg1evluJZZzpMX-VWzchxQ1qSNVW3r-jOEnKqMcgZw=&xsec_source=pc_search"
# ........................
]
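Since XHS_CREATOR_ID_LIST mixes full profile URLs and bare user IDs, here is a small hedged sketch of how an entry in either format could be normalized into its user ID plus xsec parameters. The helper name and return shape are hypothetical and not part of this changeset; MediaCrawler's own parsing may differ.

```python
# Hypothetical helper (not part of this diff) that normalizes a creator entry
# into (user_id, xsec_token, xsec_source), accepting both supported formats.
from urllib.parse import urlparse, parse_qs


def parse_creator_entry(entry: str):
    if not entry.startswith("http"):
        # Format 2: a plain user_id with no token information attached
        return entry, "", ""
    # Format 1: full profile URL ".../user/profile/<user_id>?xsec_token=...&xsec_source=..."
    parsed = urlparse(entry)
    query = parse_qs(parsed.query)
    user_id = parsed.path.rstrip("/").split("/")[-1]
    xsec_token = query.get("xsec_token", [""])[0]
    xsec_source = query.get("xsec_source", ["pc_feed"])[0]  # assumed default channel
    return user_id, xsec_token, xsec_source


# Example:
# parse_creator_entry("63e36c9a000000002703502b") -> ("63e36c9a000000002703502b", "", "")
```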
@@ -17,6 +17,7 @@ from urllib.parse import urlencode

import httpx
from playwright.async_api import BrowserContext, Page
from tenacity import retry, stop_after_attempt, wait_fixed
from xhshow import Xhshow

import config
from base.base_crawler import AbstractApiClient

@@ -27,7 +28,6 @@ from .exception import DataFetchError, IPBlockError

from .field import SearchNoteType, SearchSortType
from .help import get_search_id, sign
from .extractor import XiaoHongShuExtractor
from .secsign import seccore_signv2_playwright


class XiaoHongShuClient(AbstractApiClient):
@@ -53,24 +53,51 @@ class XiaoHongShuClient(AbstractApiClient):

self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
self._extractor = XiaoHongShuExtractor()
# Initialize the xhshow client used for signature generation
self._xhshow_client = Xhshow()

async def _pre_headers(self, url: str, data=None) -> Dict:
"""
Sign the request header parameters
Sign the request header parameters, using the xhshow library to generate the signature
Args:
url:
data:
url: full URI (for GET requests this includes the query parameters)
data: request body of a POST request

Returns:

"""
x_s = await seccore_signv2_playwright(self.playwright_page, url, data)
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
# Get the a1 cookie value
a1_value = self.cookie_dict.get("a1", "")

# Use a different signing method depending on the request type
if data is None:
# GET request: extract the parameters from the url
from urllib.parse import urlparse, parse_qs
parsed = urlparse(url)
params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}
# Use the full URL (including the host)
full_url = f"{self._host}{url}"
x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params)
else:
# POST request: use data as the payload
full_url = f"{self._host}{url}"
x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)

# Try to read the b1 value from localStorage; fall back to an empty string if that fails
b1_value = ""
try:
if self.playwright_page:
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
b1_value = local_storage.get("b1", "")
except Exception as e:
utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string")

# Use the sign helper to generate the remaining signature headers
signs = sign(
a1=self.cookie_dict.get("a1", ""),
b1=local_storage.get("b1", ""),
a1=a1_value,
b1=b1_value,
x_s=x_s,
x_t=str(int(time.time())),
x_t=str(int(time.time() * 1000)),  # x-t uses a millisecond timestamp
)

headers = {
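Because the removed seccore-based lines are interleaved with the added xhshow-based lines above, here is a minimal, hedged sketch of the new signing path in one piece. `HOST`, `cookie_dict` and `page` are stand-ins for the client's `self._host`, `self.cookie_dict` and `self.playwright_page`; the `sign_xs_get`/`sign_xs_post` calls and the localStorage fallback mirror the lines added in this hunk, not a documented xhshow API beyond what the diff shows.

```python
# A condensed sketch (not part of the diff) of the new xhshow-based signing flow.
from typing import Optional
from urllib.parse import urlparse, parse_qs

from xhshow import Xhshow

HOST = "https://edith.xiaohongshu.com"  # assumed API host; the client keeps this in self._host
xhshow_client = Xhshow()


def build_x_s(cookie_dict: dict, url: str, data: Optional[dict] = None) -> str:
    """Return the x-s signature for a GET (data is None) or POST request."""
    a1_value = cookie_dict.get("a1", "")
    full_url = f"{HOST}{url}"
    if data is None:
        # GET: rebuild the query parameters from the URI and sign them
        parsed = urlparse(url)
        params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}
        return xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params)
    # POST: the JSON request body is signed as the payload
    return xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)


async def read_b1(page) -> str:
    """Best-effort read of b1 from the page's localStorage; empty string on any failure."""
    try:
        local_storage = await page.evaluate("() => window.localStorage")
        return local_storage.get("b1", "")
    except Exception:
        return ""
```

The hunk also switches `x-t` from a second-resolution to a millisecond timestamp (`str(int(time.time() * 1000))`).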
@@ -115,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient):

elif data["code"] == self.IP_ERROR_CODE:
raise IPBlockError(self.IP_ERROR_STR)
else:
raise DataFetchError(data.get("msg", None))
err_msg = data.get("msg", None) or f"{response.text}"
raise DataFetchError(err_msg)

async def get(self, uri: str, params=None) -> Dict:
"""
@@ -480,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient):

creator: str,
cursor: str,
page_size: int = 30,
xsec_token: str = "",
xsec_source: str = "pc_feed",
) -> Dict:
"""
Get a creator's notes

@@ -487,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient):

creator: creator ID
cursor: ID of the last note on the previous page
page_size: page size
xsec_token: verification token
xsec_source: channel source

Returns:

"""
uri = "/api/sns/web/v1/user_posted"
data = {
"user_id": creator,
"cursor": cursor,
"num": page_size,
"image_formats": "jpg,webp,avif",
}
return await self.get(uri, data)
uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}"
return await self.get(uri)

async def get_all_notes_by_creator(
self,
user_id: str,
crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
xsec_token: str = "",
xsec_source: str = "pc_feed",
) -> List[Dict]:
"""
Get all posts published by the specified user; this method keeps paging until it has collected all of the user's posts

@@ -512,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient):

user_id: user ID
crawl_interval: delay between crawl requests (seconds)
callback: callback invoked after each page of results has been crawled
xsec_token: verification token
xsec_source: channel source

Returns:

@@ -520,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient):

notes_has_more = True
notes_cursor = ""
while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source)
if not notes_res:
utils.logger.error(
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
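For context, a hedged usage sketch of the extended `get_all_notes_by_creator` signature, which now forwards `xsec_token` and `xsec_source` to every `user_posted` page request. The creator ID and token below are placeholders, and `xhs_client` stands for an already-initialized `XiaoHongShuClient`.

```python
# Hypothetical caller; the ID/token values are placeholders taken from a creator profile URL.
async def dump_creator_notes(xhs_client) -> None:
    notes = await xhs_client.get_all_notes_by_creator(
        user_id="63e36c9a000000002703502b",   # placeholder creator ID
        crawl_interval=1.0,
        callback=None,                        # or an async callback invoked per crawled page
        xsec_token="<xsec_token from the profile URL>",
        xsec_source="pc_feed",
    )
    print(f"collected {len(notes)} notes")
```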
@@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler):

user_id=user_id,
crawl_interval=crawl_interval,
callback=self.fetch_creator_notes_detail,
xsec_token=creator_info.xsec_token,
xsec_source=creator_info.xsec_source,
)

note_ids = []
@@ -279,12 +281,19 @@

Dict: note detail
"""
note_detail = None
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
async with semaphore:
try:
utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
try:
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
except RetryError:
pass

if not note_detail:
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=True)
if not note_detail:
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")

note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
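Because the removed and added lines are interleaved above, here is a consolidated, hedged sketch of the new control flow: try the signed API endpoint first, swallow tenacity's `RetryError`, then fall back to parsing the note's HTML page. The method names follow the crawler's client; the surrounding class structure and logging are simplified.

```python
# A simplified sketch (not the full crawler code) of the API-first / HTML-fallback flow.
from tenacity import RetryError


async def get_note_detail(xhs_client, note_id: str, xsec_source: str, xsec_token: str, semaphore):
    async with semaphore:
        note_detail = None
        try:
            # 1) Preferred path: the signed web API endpoint
            note_detail = await xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
        except RetryError:
            # All retries exhausted; fall through to the HTML fallback
            pass

        if not note_detail:
            # 2) Fallback: fetch and parse the note's HTML page (with cookies enabled)
            note_detail = await xhs_client.get_note_by_id_from_html(
                note_id, xsec_source, xsec_token, enable_cookie=True
            )
        if not note_detail:
            raise Exception(f"Failed to get note detail, Id: {note_id}")

        note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
        return note_detail
```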
@@ -1,66 +0,0 @@

# -*- coding: utf-8 -*-
# Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Do not perform large-scale crawling or disrupt the platform's operation.
# 4. Keep the request rate reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to the principles above and to all terms in the LICENSE.

import hashlib
import base64
import json
from typing import Any

def _build_c(e: Any, a: Any) -> str:
c = str(e)
if isinstance(a, (dict, list)):
c += json.dumps(a, separators=(",", ":"), ensure_ascii=False)
elif isinstance(a, str):
c += a
# other types are not appended
return c


# ---------------------------
# p.Pu = MD5(c) => lowercase hex
# ---------------------------
def _md5_hex(s: str) -> str:
return hashlib.md5(s.encode("utf-8")).hexdigest()


# ============================================================
# Playwright version (async): pass in page (a Page object)
# Internally calls page.evaluate('window.mnsv2(...)')
# ============================================================
async def seccore_signv2_playwright(
page,  # Playwright Page
e: Any,
a: Any,
) -> str:
"""
Use Playwright's page.evaluate to call window.mnsv2(c, d) and generate the signature.
The page context must already contain window.mnsv2 (e.g. the target site's script has been injected).

Usage:
s = await page.evaluate("(c, d) => window.mnsv2(c, d)", c, d)
"""
c = _build_c(e, a)
d = _md5_hex(c)

# Call window.mnsv2 inside the browser context
s = await page.evaluate("(c, d) => window.mnsv2(c, d)", [c, d])
f = {
"x0": "4.2.6",
"x1": "xhs-pc-web",
"x2": "Mac OS",
"x3": s,
"x4": a,
}
payload = json.dumps(f, separators=(",", ":"), ensure_ascii=False).encode("utf-8")
token = "XYS_" + base64.b64encode(payload).decode("ascii")
print(token)
return token
@@ -33,6 +33,7 @@ dependencies = [

"typer>=0.12.3",
"uvicorn==0.29.0",
"wordcloud==1.9.3",
"xhshow>=0.1.3",
]

[[tool.uv.index]]
@@ -25,3 +25,4 @@ alembic>=1.16.5

asyncmy>=0.2.10
sqlalchemy>=2.0.43
motor>=3.3.0
xhshow>=0.1.3