# -*- coding: utf-8 -*- # Copyright (c) 2025 relakkes@gmail.com # # This file is part of MediaCrawler project. # Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/xhs/client.py # GitHub: https://github.com/NanmiCoder # Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1 # # 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: # 1. 不得用于任何商业用途。 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 # 3. 不得进行大规模爬取或对平台造成运营干扰。 # 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 # 5. 不得用于任何非法或不当的用途。 # # 详细许可条款请参阅项目根目录下的LICENSE文件。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 import asyncio import json import time from typing import Any, Callable, Dict, List, Optional, Union from urllib.parse import urlencode, urlparse, parse_qs import httpx from playwright.async_api import BrowserContext, Page from tenacity import retry, stop_after_attempt, wait_fixed from xhshow import Xhshow import config from base.base_crawler import AbstractApiClient from tools import utils from .exception import DataFetchError, IPBlockError from .field import SearchNoteType, SearchSortType from .help import get_search_id, sign from .extractor import XiaoHongShuExtractor class XiaoHongShuClient(AbstractApiClient): def __init__( self, timeout=60, # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间 proxy=None, *, headers: Dict[str, str], playwright_page: Page, cookie_dict: Dict[str, str], ): self.proxy = proxy self.timeout = timeout self.headers = headers self._host = "https://edith.xiaohongshu.com" self._domain = "https://www.xiaohongshu.com" self.IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试" self.IP_ERROR_CODE = 300012 self.NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看" self.NOTE_ABNORMAL_CODE = -510001 self.playwright_page = playwright_page self.cookie_dict = cookie_dict self._extractor = XiaoHongShuExtractor() # 初始化 xhshow 客户端用于签名生成 self._xhshow_client = Xhshow() async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict: """请求头参数签名 Args: url: 请求的URL(GET请求是包含请求的参数) params: GET请求的参数 payload: POST请求的参数 Returns: Dict: 请求头参数签名 """ a1_value = self.cookie_dict.get("a1", "") parsed = urlparse(url) uri = parsed.path if params is not None: x_s = self._xhshow_client.sign_xs_get( uri=uri, a1_value=a1_value, params=params ) elif payload is not None: x_s = self._xhshow_client.sign_xs_post( uri=uri, a1_value=a1_value, payload=payload ) else: raise ValueError("params or payload is required") # 获取 b1 值 b1_value = "" try: if self.playwright_page: local_storage = await self.playwright_page.evaluate( "() => window.localStorage" ) b1_value = local_storage.get("b1", "") except Exception as e: utils.logger.warning( f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}" ) signs = sign( a1=a1_value, b1=b1_value, x_s=x_s, x_t=str(int(time.time() * 1000)), ) headers = { "X-S": signs["x-s"], "X-T": signs["x-t"], "x-S-Common": signs["x-s-common"], "X-B3-Traceid": signs["x-b3-traceid"], } self.headers.update(headers) return self.headers @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) async def request(self, method, url, **kwargs) -> Union[str, Any]: """ 封装httpx的公共请求方法,对请求响应做一些处理 Args: method: 请求方法 url: 请求的URL **kwargs: 其他请求参数,例如请求头、请求体等 Returns: """ # return response.text return_response = kwargs.pop("return_response", False) async with httpx.AsyncClient(proxy=self.proxy) as client: response = await client.request(method, url, timeout=self.timeout, **kwargs) if response.status_code == 471 or response.status_code == 461: # someday someone maybe will bypass captcha verify_type = response.headers["Verifytype"] verify_uuid = response.headers["Verifyuuid"] msg = f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}" utils.logger.error(msg) raise Exception(msg) if return_response: return response.text data: Dict = response.json() if data["success"]: return data.get("data", data.get("success", {})) elif data["code"] == self.IP_ERROR_CODE: raise IPBlockError(self.IP_ERROR_STR) else: err_msg = data.get("msg", None) or f"{response.text}" raise DataFetchError(err_msg) async def get(self, uri: str, params: Optional[Dict] = None) -> Dict: """ GET请求,对请求头签名 Args: uri: 请求路由 params: 请求参数 Returns: """ headers = await self._pre_headers(uri, params) if isinstance(params, dict): # 使用 xhsshow build_url 构建完整的 URL full_url = self._xhshow_client.build_url( base_url=f"{self._host}{uri}", params=params ) else: full_url = f"{self._host}{uri}" return await self.request( method="GET", url=full_url, headers=headers ) async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ POST请求,对请求头签名 Args: uri: 请求路由 data: 请求体参数 Returns: """ headers = await self._pre_headers(uri, payload=data) json_str = self._xhshow_client.build_json_body(payload=data) return await self.request( method="POST", url=f"{self._host}{uri}", data=json_str, headers=headers, **kwargs, ) async def get_note_media(self, url: str) -> Union[bytes, None]: async with httpx.AsyncClient(proxy=self.proxy) as client: try: response = await client.request("GET", url, timeout=self.timeout) response.raise_for_status() if not response.reason_phrase == "OK": utils.logger.error( f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}" ) return None else: return response.content except ( httpx.HTTPError ) as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx utils.logger.error( f"[XiaoHongShuClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}" ) # 保留原始异常类型名称,以便开发者调试 return None async def pong(self) -> bool: """ 用于检查登录态是否失效了 Returns: """ """get a note to check if login state is ok""" utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...") ping_flag = False try: note_card: Dict = await self.get_note_by_keyword(keyword="小红书") if note_card.get("items"): ping_flag = True except Exception as e: utils.logger.error( f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again..." ) ping_flag = False return ping_flag async def update_cookies(self, browser_context: BrowserContext): """ API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法 Args: browser_context: 浏览器上下文对象 Returns: """ cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict async def get_note_by_keyword( self, keyword: str, search_id: str = get_search_id(), page: int = 1, page_size: int = 20, sort: SearchSortType = SearchSortType.GENERAL, note_type: SearchNoteType = SearchNoteType.ALL, ) -> Dict: """ 根据关键词搜索笔记 Args: keyword: 关键词参数 page: 分页第几页 page_size: 分页数据长度 sort: 搜索结果排序指定 note_type: 搜索的笔记类型 Returns: """ uri = "/api/sns/web/v1/search/notes" data = { "keyword": keyword, "page": page, "page_size": page_size, "search_id": search_id, "sort": sort.value, "note_type": note_type.value, } return await self.post(uri, data) async def get_note_by_id( self, note_id: str, xsec_source: str, xsec_token: str, ) -> Dict: """ 获取笔记详情API Args: note_id:笔记ID xsec_source: 渠道来源 xsec_token: 搜索关键字之后返回的比较列表中返回的token Returns: """ if xsec_source == "": xsec_source = "pc_search" data = { "source_note_id": note_id, "image_formats": ["jpg", "webp", "avif"], "extra": {"need_body_topic": 1}, "xsec_source": xsec_source, "xsec_token": xsec_token, } uri = "/api/sns/web/v1/feed" res = await self.post(uri, data) if res and res.get("items"): res_dict: Dict = res["items"][0]["note_card"] return res_dict # 爬取频繁了可能会出现有的笔记能有结果有的没有 utils.logger.error( f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}" ) return dict() async def get_note_comments( self, note_id: str, xsec_token: str, cursor: str = "", ) -> Dict: """ 获取一级评论的API Args: note_id: 笔记ID xsec_token: 验证token cursor: 分页游标 Returns: """ uri = "/api/sns/web/v2/comment/page" params = { "note_id": note_id, "cursor": cursor, "top_comment_id": "", "image_formats": "jpg,webp,avif", "xsec_token": xsec_token, } return await self.get(uri, params) async def get_note_sub_comments( self, note_id: str, root_comment_id: str, xsec_token: str, num: int = 10, cursor: str = "", ): """ 获取指定父评论下的子评论的API Args: note_id: 子评论的帖子ID root_comment_id: 根评论ID xsec_token: 验证token num: 分页数量 cursor: 分页游标 Returns: """ uri = "/api/sns/web/v2/comment/sub/page" params = { "note_id": note_id, "root_comment_id": root_comment_id, "num": num, "cursor": cursor, "image_formats": "jpg,webp,avif", "top_comment_id": "", "xsec_token": xsec_token, } return await self.get(uri, params) async def get_note_all_comments( self, note_id: str, xsec_token: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, max_count: int = 10, ) -> List[Dict]: """ 获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 Args: note_id: 笔记ID xsec_token: 验证token crawl_interval: 爬取一次笔记的延迟单位(秒) callback: 一次笔记爬取结束后 max_count: 一次笔记爬取的最大评论数量 Returns: """ result = [] comments_has_more = True comments_cursor = "" while comments_has_more and len(result) < max_count: comments_res = await self.get_note_comments( note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor ) comments_has_more = comments_res.get("has_more", False) comments_cursor = comments_res.get("cursor", "") if "comments" not in comments_res: utils.logger.info( f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}" ) break comments = comments_res["comments"] if len(result) + len(comments) > max_count: comments = comments[: max_count - len(result)] if callback: await callback(note_id, comments) await asyncio.sleep(crawl_interval) result.extend(comments) sub_comments = await self.get_comments_all_sub_comments( comments=comments, xsec_token=xsec_token, crawl_interval=crawl_interval, callback=callback, ) result.extend(sub_comments) return result async def get_comments_all_sub_comments( self, comments: List[Dict], xsec_token: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, ) -> List[Dict]: """ 获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息 Args: comments: 评论列表 xsec_token: 验证token crawl_interval: 爬取一次评论的延迟单位(秒) callback: 一次评论爬取结束后 Returns: """ if not config.ENABLE_GET_SUB_COMMENTS: utils.logger.info( f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled" ) return [] result = [] for comment in comments: note_id = comment.get("note_id") sub_comments = comment.get("sub_comments") if sub_comments and callback: await callback(note_id, sub_comments) sub_comment_has_more = comment.get("sub_comment_has_more") if not sub_comment_has_more: continue root_comment_id = comment.get("id") sub_comment_cursor = comment.get("sub_comment_cursor") while sub_comment_has_more: comments_res = await self.get_note_sub_comments( note_id=note_id, root_comment_id=root_comment_id, xsec_token=xsec_token, num=10, cursor=sub_comment_cursor, ) if comments_res is None: utils.logger.info( f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}" ) continue sub_comment_has_more = comments_res.get("has_more", False) sub_comment_cursor = comments_res.get("cursor", "") if "comments" not in comments_res: utils.logger.info( f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}" ) break comments = comments_res["comments"] if callback: await callback(note_id, comments) await asyncio.sleep(crawl_interval) result.extend(comments) return result async def get_creator_info( self, user_id: str, xsec_token: str = "", xsec_source: str = "" ) -> Dict: """ 通过解析网页版的用户主页HTML,获取用户个人简要信息 PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可 Args: user_id: 用户ID xsec_token: 验证token (可选,如果URL中包含此参数则传入) xsec_source: 渠道来源 (可选,如果URL中包含此参数则传入) Returns: Dict: 创作者信息 """ # 构建URI,如果有xsec参数则添加到URL中 uri = f"/user/profile/{user_id}" if xsec_token and xsec_source: uri = f"{uri}?xsec_token={xsec_token}&xsec_source={xsec_source}" html_content = await self.request( "GET", self._domain + uri, return_response=True, headers=self.headers ) return self._extractor.extract_creator_info_from_html(html_content) async def get_notes_by_creator( self, creator: str, cursor: str, page_size: int = 30, xsec_token: str = "", xsec_source: str = "pc_feed", ) -> Dict: """ 获取博主的笔记 Args: creator: 博主ID cursor: 上一页最后一条笔记的ID page_size: 分页数据长度 xsec_token: 验证token xsec_source: 渠道来源 Returns: """ uri = f"/api/sns/web/v1/user_posted" params = { "num": page_size, "cursor": cursor, "user_id": creator, "xsec_token": xsec_token, "xsec_source": xsec_source, } return await self.get(uri, params) async def get_all_notes_by_creator( self, user_id: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, xsec_token: str = "", xsec_source: str = "pc_feed", ) -> List[Dict]: """ 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 Args: user_id: 用户ID crawl_interval: 爬取一次的延迟单位(秒) callback: 一次分页爬取结束后的更新回调函数 xsec_token: 验证token xsec_source: 渠道来源 Returns: """ result = [] notes_has_more = True notes_cursor = "" while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: notes_res = await self.get_notes_by_creator( user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source ) if not notes_res: utils.logger.error( f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data." ) break notes_has_more = notes_res.get("has_more", False) notes_cursor = notes_res.get("cursor", "") if "notes" not in notes_res: utils.logger.info( f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}" ) break notes = notes_res["notes"] utils.logger.info( f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}" ) remaining = config.CRAWLER_MAX_NOTES_COUNT - len(result) if remaining <= 0: break notes_to_add = notes[:remaining] if callback: await callback(notes_to_add) result.extend(notes_to_add) await asyncio.sleep(crawl_interval) utils.logger.info( f"[XiaoHongShuClient.get_all_notes_by_creator] Finished getting notes for user {user_id}, total: {len(result)}" ) return result async def get_note_short_url(self, note_id: str) -> Dict: """ 获取笔记的短链接 Args: note_id: 笔记ID Returns: """ uri = f"/api/sns/web/short_url" data = {"original_url": f"{self._domain}/discovery/item/{note_id}"} return await self.post(uri, data=data, return_response=True) @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) async def get_note_by_id_from_html( self, note_id: str, xsec_source: str, xsec_token: str, enable_cookie: bool = False, ) -> Optional[Dict]: """ 通过解析网页版的笔记详情页HTML,获取笔记详情, 该接口可能会出现失败的情况,这里尝试重试3次 copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259 thanks for ReaJason Args: note_id: xsec_source: xsec_token: enable_cookie: Returns: """ url = ( "https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}" ) copy_headers = self.headers.copy() if not enable_cookie: del copy_headers["Cookie"] html = await self.request( method="GET", url=url, return_response=True, headers=copy_headers ) return self._extractor.extract_note_detail_from_html(note_id, html)