2025-11-18 12:24:02 +08:00
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/weibo/client.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
2025-07-30 21:19:56 +08:00
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
2024-10-20 00:43:25 +08:00
# 5. 不得用于任何非法或不当的用途。
2025-07-30 21:19:56 +08:00
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
2024-10-20 00:43:25 +08:00
2023-12-24 17:57:48 +08:00
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:40
# @Desc : 微博爬虫 API 请求 client
import asyncio
2023-12-25 00:02:11 +08:00
import copy
2023-12-24 17:57:48 +08:00
import json
2023-12-25 00:02:11 +08:00
import re
2024-08-24 05:52:11 +08:00
from typing import Callable , Dict , List , Optional , Union
from urllib . parse import parse_qs , unquote , urlencode
2023-12-24 17:57:48 +08:00
import httpx
2024-08-24 05:52:11 +08:00
from httpx import Response
2023-12-24 17:57:48 +08:00
from playwright . async_api import BrowserContext , Page
2025-11-17 17:11:35 +08:00
from tenacity import retry , stop_after_attempt , wait_fixed
2024-04-17 23:13:40 +08:00
2024-08-05 00:48:42 +08:00
import config
2023-12-24 17:57:48 +08:00
from tools import utils
from . exception import DataFetchError
from . field import SearchType
class WeiboClient:
    """Async client for the Weibo mobile web API served from ``https://m.weibo.cn``.

    Every request is sent with the ``headers`` supplied at construction time
    (normally carrying the login ``Cookie``). A Playwright page is kept so the
    cookies can be refreshed from the browser when the API stops answering
    with JSON.
    """

    def __init__(
        self,
        timeout=60,  # image downloads need a longer timeout when media crawling is enabled
        proxy=None,
        *,
        headers: Dict[str, str],
        playwright_page: Page,
        cookie_dict: Dict[str, str],
    ):
        """Store the connection settings; no network I/O happens here.

        :param timeout: per-request timeout handed to httpx
        :param proxy: proxy setting handed to ``httpx.AsyncClient(proxy=...)``
        :param headers: default request headers; ``update_cookies`` mutates
            this dict in place
        :param playwright_page: page used to re-open the site and refresh cookies
        :param cookie_dict: current cookies as a name -> value mapping
        """
        self.proxy = proxy
        self.timeout = timeout
        self.headers = headers
        self._host = "https://m.weibo.cn"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict
        # Weibo's image host applies hot-link protection, so images are
        # fetched through this proxy host instead.
        self._image_agent_host = "https://i1.wp.com/"

    @retry(stop=stop_after_attempt(5), wait=wait_fixed(3))
    async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
        """Send one HTTP request and unwrap Weibo's JSON envelope.

        The call is attempted up to 5 times, 3 seconds apart. The decorator
        does not set ``reraise``, so per tenacity's default an exhausted retry
        surfaces as ``tenacity.RetryError`` wrapping the last exception.

        :param method: HTTP method name
        :param url: absolute URL
        :param kwargs: forwarded to ``httpx.AsyncClient.request``; the extra
            key ``return_response`` (default ``False``) makes the method return
            the raw ``httpx.Response`` without any JSON handling
        :return: the raw response, or the ``data`` member of the JSON body
            (``{}`` when that member is absent)
        :raises DataFetchError: when the body is not JSON, or its ``ok`` field
            is anything other than ``1``
        """
        enable_return_response = kwargs.pop("return_response", False)
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request(method, url, timeout=self.timeout, **kwargs)

        if enable_return_response:
            return response

        try:
            data: Dict = response.json()
        except json.decoder.JSONDecodeError as exc:
            # issue #771: the search endpoint may answer with HTTP 432; re-open
            # the h5 site and refresh the cookies so the next retry can succeed.
            utils.logger.error(f"[WeiboClient.request] request {method}:{url} err code: {response.status_code} res: {response.text}")
            await self.playwright_page.goto(self._host)
            await asyncio.sleep(2)
            await self.update_cookies(browser_context=self.playwright_page.context)
            raise DataFetchError(f"get response code error: {response.status_code}") from exc

        ok_code = data.get("ok")
        if ok_code == 1:  # success
            return data.get("data", {})

        # ok == 0 is an explicit API error; any other value is unexpected.
        fallback_msg = "response error" if ok_code == 0 else "unknown error"
        utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
        raise DataFetchError(data.get("msg", fallback_msg))

    async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
        """Issue a GET against ``self._host``.

        :param uri: path beginning with ``/``
        :param params: when a dict, url-encoded and appended as the query string
        :param headers: request headers; defaults to ``self.headers``
        :param kwargs: forwarded to ``request`` (e.g. ``return_response=True``)
        :return: whatever ``request`` returns
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        if headers is None:
            headers = self.headers
        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)

    async def post(self, uri: str, data: dict) -> Dict:
        """POST ``data`` as a compact JSON string to ``self._host + uri``.

        :param uri: path beginning with ``/``
        :param data: payload, serialised without spaces and without ASCII escaping
        :return: whatever ``request`` returns
        """
        json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
        # Raw text bodies belong in ``content=``; httpx deprecates passing a
        # str through ``data=`` (reserved for form data). The bytes sent are
        # the same.
        return await self.request(method="POST", url=f"{self._host}{uri}", content=json_str, headers=self.headers)

    async def pong(self) -> bool:
        """Check whether the current cookies still represent a logged-in session.

        Calls ``/api/config`` and inspects its ``login`` field.

        :return: ``True`` when ``login`` is truthy; ``False`` otherwise,
            including when the request raises for any reason
        """
        utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
        try:
            uri = "/api/config"
            resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
        except Exception as e:
            # Deliberate best-effort: any failure simply means "log in again".
            utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
            return False

        if resp_data.get("login"):
            return True
        utils.logger.error("[WeiboClient.pong] cookie may be invalid and again login...")
        return False

    async def update_cookies(self, browser_context: BrowserContext, urls: Optional[List[str]] = None):
        """Copy cookies from the browser context into this client.

        Updates ``self.headers["Cookie"]`` and ``self.cookie_dict``.

        :param browser_context: Playwright browser context to read cookies from
        :param urls: optional list of URLs (e.g. ``["https://m.weibo.cn"]``);
            when given and non-empty, only cookies for these URLs are requested
        """
        if urls:
            cookies = await browser_context.cookies(urls=urls)
            utils.logger.info(f"[WeiboClient.update_cookies] Updating cookies for specific URLs: {urls}")
        else:
            cookies = await browser_context.cookies()
            utils.logger.info("[WeiboClient.update_cookies] Updating all cookies")

        cookie_str, cookie_dict = utils.convert_cookies(cookies)
        self.headers["Cookie"] = cookie_str
        self.cookie_dict = cookie_dict
        utils.logger.info(f"[WeiboClient.update_cookies] Cookie updated successfully, total: {len(cookie_dict)} cookies")

    async def get_note_by_keyword(
        self,
        keyword: str,
        page: int = 1,
        search_type: SearchType = SearchType.DEFAULT,
    ) -> Dict:
        """Search posts by keyword.

        :param keyword: search keyword
        :param page: page number to fetch
        :param search_type: kind of search; its ``value`` is embedded in the
            ``containerid`` query parameter
        :return: the ``data`` payload of the search response
        """
        uri = "/api/container/getIndex"
        containerid = f"100103type={search_type.value}&q={keyword}"
        params = {
            "containerid": containerid,
            "page_type": "searchall",
            "page": page,
        }
        return await self.get(uri, params)

    async def get_note_comments(self, mid_id: str, max_id: int, max_id_type: int = 0) -> Dict:
        """Fetch one page of comments for a post.

        :param mid_id: post ID, sent as both ``id`` and ``mid``
        :param max_id: paging cursor; only sent when greater than 0
        :param max_id_type: paging cursor type
        :return: the ``data`` payload of the comments response
        """
        uri = "/comments/hotflow"
        params = {
            "id": mid_id,
            "mid": mid_id,
            "max_id_type": max_id_type,
        }
        if max_id > 0:
            params.update({"max_id": max_id})

        # Send the post's detail page as Referer on a copy of the headers so
        # the shared default headers are left untouched.
        referer_url = f"https://m.weibo.cn/detail/{mid_id}"
        headers = copy.copy(self.headers)
        headers["Referer"] = referer_url
        return await self.get(uri, params, headers=headers)

    async def get_note_all_comments(
        self,
        note_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
        max_count: int = 10,
    ):
        """Page through the comments of a post, including inline sub-comments.

        Paging stops when the API returns ``max_id == 0`` (or no ``max_id``)
        or once ``max_count`` items have been collected. Sub-comments are
        appended to the same result list and therefore count toward
        ``max_count`` on the next loop check.

        :param note_id: post ID
        :param crawl_interval: seconds to sleep after each page
        :param callback: optional coroutine function awaited as
            ``callback(note_id, comments)`` for each page
        :param max_count: cap applied to the top-level comments of each page
        :return: list of collected comment dicts
        """
        result = []
        is_end = False
        max_id = -1
        max_id_type = 0
        while not is_end and len(result) < max_count:
            comments_res = await self.get_note_comments(note_id, max_id, max_id_type) or {}
            # A missing/null cursor is treated as "no more pages": it could
            # not be compared or url-encoded meaningfully on the next request.
            max_id = comments_res.get("max_id") or 0
            max_id_type = comments_res.get("max_id_type") or 0
            comment_list: List[Dict] = comments_res.get("data") or []
            is_end = max_id == 0

            # Trim the page so the top-level comments never exceed max_count.
            remaining = max_count - len(result)
            if len(comment_list) > remaining:
                comment_list = comment_list[:remaining]

            if callback:
                await callback(note_id, comment_list)
            await asyncio.sleep(crawl_interval)

            result.extend(comment_list)
            sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
            result.extend(sub_comment_result)
        return result

    @staticmethod
    async def get_comments_all_sub_comments(
        note_id: str,
        comment_list: List[Dict],
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """Collect the sub-comments embedded in each comment's ``comments`` field.

        No request is made; only data already present in ``comment_list`` is read.

        Args:
            note_id: post ID, passed through to ``callback``
            comment_list: top-level comment dicts
            callback: optional coroutine function awaited as
                ``callback(note_id, sub_comments)`` per parent comment

        Returns:
            Flat list of sub-comment dicts; empty when
            ``config.ENABLE_GET_SUB_COMMENTS`` is falsy.
        """
        if not config.ENABLE_GET_SUB_COMMENTS:
            utils.logger.info("[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
            return []

        res_sub_comments = []
        for comment in comment_list:
            sub_comments = comment.get("comments")
            # Only a non-empty list is treated as sub-comments.
            if sub_comments and isinstance(sub_comments, list):
                if callback:  # callback is optional; do not await None
                    await callback(note_id, sub_comments)
                res_sub_comments.extend(sub_comments)
        return res_sub_comments

    async def get_note_info_by_id(self, note_id: str) -> Dict:
        """Fetch a post's detail by scraping its detail page.

        The page embeds its data as ``var $render_data = [...][0]``; the JSON
        array is extracted with a regex and its first element's ``status``
        field is returned.

        :param note_id: post ID
        :return: ``{"mblog": <status>}``, or an empty dict when the page does
            not contain ``$render_data``
        :raises DataFetchError: when the page responds with a non-200 status
        """
        url = f"{self._host}/detail/{note_id}"
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)

        if response.status_code != 200:
            raise DataFetchError(f"get weibo detail err: {response.text}")

        match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
        if not match:
            utils.logger.info("[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
            return dict()

        render_data_dict = json.loads(match.group(1))
        note_detail = render_data_dict[0].get("status")
        return {"mblog": note_detail}

    async def get_note_image(self, image_url: str) -> Optional[bytes]:
        """Download the large variant of a post image through the image proxy.

        The second path component of ``image_url`` (host first) is replaced by
        ``large`` and the result is appended to ``self._image_agent_host``.

        :param image_url: original image URL
        :return: image bytes, or ``None`` when the download fails or the
            response reason phrase is not ``OK``
        """
        # Drop the scheme ("https://", "http://", ...) and split host/path.
        segments = image_url.split("://", 1)[-1].split("/")
        last_index = len(segments) - 1
        rebuilt = []
        for index, segment in enumerate(segments):
            if index == 1:
                rebuilt.append("large/")  # always request the large image
            elif index == last_index:
                rebuilt.append(segment)
            else:
                rebuilt.append(segment + "/")

        # Weibo's image host has hot-link protection, so go through the proxy
        # host and simply prefix the rewritten host/path with it.
        final_uri = f"{self._image_agent_host}{''.join(rebuilt)}"
        async with httpx.AsyncClient(proxy=self.proxy) as client:
            try:
                response = await client.request("GET", final_uri, timeout=self.timeout)
                response.raise_for_status()
            except httpx.HTTPError as exc:
                # Connection errors and non-2xx statuses alike. The exception
                # class name is kept in the log for debugging; the URL comes
                # from final_uri because ``exc.request`` is not always set.
                utils.logger.error(f"[WeiboClient.get_note_image] {exc.__class__.__name__} for {final_uri} - {exc}")
                return None

            if response.reason_phrase != "OK":
                utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
                return None
            return response.content

    async def get_creator_container_info(self, creator_id: str) -> Dict:
        """Read a creator's container IDs from the ``M_WEIBOCN_PARAMS`` cookie.

        Requests ``/u/<creator_id>`` and parses the url-encoded query string
        stored in the response cookie ``M_WEIBOCN_PARAMS``.

        Args:
            creator_id: creator (user) ID

        Returns:
            ``{"fid_container_id": ..., "lfid_container_id": ...}`` taken from
            the cookie's ``fid`` and ``lfid`` entries (empty string when an
            entry is absent). The original notes describe ``fid`` as the
            container of the user-detail API and ``lfid`` as the container of
            the user's post-list API.

        Raises:
            DataFetchError: when the cookie is missing or empty
        """
        response = await self.get(f"/u/{creator_id}", return_response=True)
        m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS")
        if not m_weibocn_params:
            raise DataFetchError("get containerid failed")

        m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params))
        return {
            "fid_container_id": m_weibocn_params_dict.get("fid", [""])[0],
            "lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0],
        }

    async def get_creator_info_by_id(self, creator_id: str) -> Dict:
        """Fetch a creator's profile data.

        Args:
            creator_id: creator (user) ID

        Returns:
            The ``data`` payload of the ``getIndex`` response for container
            ``100505<creator_id>``.
        """
        uri = "/api/container/getIndex"
        containerid = f"100505{creator_id}"
        params = {
            "jumpfrom": "weibocom",
            "type": "uid",
            "value": creator_id,
            "containerid": containerid,
        }
        return await self.get(uri, params)

    async def get_notes_by_creator(
        self,
        creator: str,
        container_id: str,
        since_id: str = "0",
    ) -> Dict:
        """Fetch one page of a creator's posts.

        Args:
            creator: creator (user) ID
            container_id: container ID of the creator's post list
            since_id: paging cursor (ID of the last post on the previous page)

        Returns:
            The ``data`` payload of the ``getIndex`` response.
        """
        uri = "/api/container/getIndex"
        params = {
            "jumpfrom": "weibocom",
            "type": "uid",
            "value": creator,
            "containerid": container_id,
            "since_id": since_id,
        }
        return await self.get(uri, params)

    async def get_all_notes_by_creator_id(
        self,
        creator_id: str,
        container_id: str,
        crawl_interval: float = 1.0,
        callback: Optional[Callable] = None,
    ) -> List[Dict]:
        """Page through all posts of a creator.

        Keeps requesting pages until the response is empty, has no ``cards``
        key, or the running count reaches ``cardlistInfo.total``.

        Args:
            creator_id: creator (user) ID
            container_id: container ID of the creator's post list
            crawl_interval: seconds to sleep after each page
            callback: optional coroutine function awaited as
                ``callback(notes)`` for each page

        Returns:
            All collected cards whose ``card_type`` is 9.
        """
        result = []
        notes_has_more = True
        since_id = ""
        crawler_total_count = 0
        while notes_has_more:
            notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
            if not notes_res:
                utils.logger.error("[WeiboClient.get_all_notes_by_creator_id] The current creator may have been banned by weibo, so they cannot access the data.")
                break

            since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
            if "cards" not in notes_res:
                utils.logger.info(f"[WeiboClient.get_all_notes_by_creator_id] No 'cards' key found in response: {notes_res}")
                break

            notes = notes_res["cards"]
            utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
            # Only cards with card_type 9 are kept as posts.
            notes = [note for note in notes if note.get("card_type") == 9]
            if callback:
                await callback(notes)
            await asyncio.sleep(crawl_interval)
            result.extend(notes)

            # Progress is counted as a fixed 10 per request, regardless of how
            # many cards were kept (presumably the API page size — confirm).
            crawler_total_count += 10
            notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
        return result