# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/kuaishou/core.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# Disclaimer: This code is for learning and research purposes only. Users must follow these principles:
# 1. Do not use it for any commercial purpose.
# 2. Comply with the target platform's terms of service and robots.txt rules.
# 3. Do not perform large-scale crawling or disrupt the platform's operations.
# 4. Keep request frequency reasonable and avoid placing unnecessary load on the target platform.
# 5. Do not use it for any illegal or improper purpose.
#
# For detailed license terms, see the LICENSE file in the project root.
# By using this code, you agree to the principles above and all terms in the LICENSE.

import asyncio
import os

# import random  # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple

from playwright.async_api import (
    BrowserContext,
    BrowserType,
    Page,
    Playwright,
    async_playwright,
)

import config
from base.base_crawler import AbstractCrawler
from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import kuaishou as kuaishou_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import comment_tasks_var, crawler_type_var, source_keyword_var

from .client import KuaiShouClient
from .exception import DataFetchError
from .help import parse_video_info_from_url, parse_creator_info_from_url
from .login import KuaishouLogin


class KuaishouCrawler(AbstractCrawler):
    context_page: Page
    ks_client: KuaiShouClient
    browser_context: BrowserContext
    cdp_manager: Optional[CDPBrowserManager]

    def __init__(self):
        self.index_url = "https://www.kuaishou.com"
        self.user_agent = utils.get_user_agent()
        self.cdp_manager = None

    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(
                config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
            )
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
                ip_proxy_info
            )

        async with async_playwright() as playwright:
            # Choose the browser launch mode based on configuration
            if config.ENABLE_CDP_MODE:
                utils.logger.info("[KuaishouCrawler] Launching browser in CDP mode")
                self.browser_context = await self.launch_browser_with_cdp(
                    playwright,
                    playwright_proxy_format,
                    self.user_agent,
                    headless=config.CDP_HEADLESS,
                )
            else:
                utils.logger.info("[KuaishouCrawler] Launching browser in standard mode")
                # Launch a browser context.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(
                    chromium, None, self.user_agent, headless=config.HEADLESS
                )

            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(f"{self.index_url}?isHome=1")

            # Create a client to interact with the kuaishou website.
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
            if not await self.ks_client.pong():
                login_obj = KuaishouLogin(
                    login_type=config.LOGIN_TYPE,
                    login_phone=httpx_proxy_format,
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
                await self.ks_client.update_cookies(
                    browser_context=self.browser_context
                )

            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search for videos and retrieve their comment information.
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_videos()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their videos and comments
                await self.get_creators_and_videos()
            else:
                pass

            utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")

    async def search(self):
        utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
        ks_limit_count = 20  # Kuaishou returns a fixed page size of 20 results
        if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            search_session_id = ""
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[KuaishouCrawler.search] Current search keyword: {keyword}"
            )
            page = 1
            # Keep paging until the crawled pages cover CRAWLER_MAX_NOTES_COUNT results
            while (
                page - start_page + 1
            ) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
                    page += 1
                    continue
                utils.logger.info(
                    f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}"
                )
                video_id_list: List[str] = []
                videos_res = await self.ks_client.search_info_by_keyword(
                    keyword=keyword,
                    pcursor=str(page),
                    search_session_id=search_session_id,
                )
                if not videos_res:
                    utils.logger.error(
                        f"[KuaishouCrawler.search] search info by keyword: {keyword} not found data"
                    )
                    continue

                vision_search_photo: Dict = videos_res.get("visionSearchPhoto")
                if vision_search_photo.get("result") != 1:
                    utils.logger.error(
                        f"[KuaishouCrawler.search] search info by keyword: {keyword} not found data"
                    )
                    continue
                search_session_id = vision_search_photo.get("searchSessionId", "")

                for video_detail in vision_search_photo.get("feeds"):
                    video_id_list.append(video_detail.get("photo", {}).get("id"))
                    await kuaishou_store.update_kuaishou_video(video_item=video_detail)

                # batch fetch video comments
                page += 1

                # Sleep after page navigation
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
                utils.logger.info(
                    f"[KuaishouCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page - 1}"
                )

                await self.batch_get_video_comments(video_id_list)

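    # The parsing in search() assumes a response payload shaped roughly like the
    # sketch below. This is inferred only from the keys accessed above; the real
    # GraphQL response carries many more fields.
    #
    # videos_res = {
    #     "visionSearchPhoto": {
    #         "result": 1,
    #         "searchSessionId": "...",
    #         "feeds": [
    #             {"photo": {"id": "<photo_id>", ...}, ...},
    #         ],
    #     }
    # }
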
    async def get_specified_videos(self):
        """Get the information and comments of the specified post"""
        utils.logger.info("[KuaishouCrawler.get_specified_videos] Parsing video URLs...")
        video_ids = []
        for video_url in config.KS_SPECIFIED_ID_LIST:
            try:
                video_info = parse_video_info_from_url(video_url)
                video_ids.append(video_info.video_id)
                utils.logger.info(f"Parsed video ID: {video_info.video_id} from {video_url}")
            except ValueError as e:
                utils.logger.error(f"Failed to parse video URL: {e}")
                continue

        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(video_id=video_id, semaphore=semaphore)
            for video_id in video_ids
        ]
        video_details = await asyncio.gather(*task_list)
        for video_detail in video_details:
            if video_detail is not None:
                await kuaishou_store.update_kuaishou_video(video_detail)
        await self.batch_get_video_comments(video_ids)

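    # config.KS_SPECIFIED_ID_LIST is expected to hold full video URLs, e.g. something
    # like "https://www.kuaishou.com/short-video/<photo_id>" (illustrative assumption;
    # parse_video_info_from_url in .help defines the exact formats it accepts and
    # raises ValueError for anything else).
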
    async def get_video_info_task(
        self, video_id: str, semaphore: asyncio.Semaphore
    ) -> Optional[Dict]:
        """Get video detail task"""
        async with semaphore:
            try:
                result = await self.ks_client.get_video_info(video_id)

                # Sleep after fetching video details
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
                utils.logger.info(
                    f"[KuaishouCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}"
                )

                utils.logger.info(
                    f"[KuaishouCrawler.get_video_info_task] Get video_id: {video_id} info result: {result} ..."
                )
                return result.get("visionVideoDetail")
            except DataFetchError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}"
                )
                return None
            except KeyError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_video_info_task] Video detail not found for video_id: {video_id}, err: {ex}"
                )
                return None

    async def batch_get_video_comments(self, video_id_list: List[str]):
        """
        Batch get video comments
        :param video_id_list:
        :return:
        """
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info(
                "[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled"
            )
            return
        utils.logger.info(
            f"[KuaishouCrawler.batch_get_video_comments] video ids: {video_id_list}"
        )
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for video_id in video_id_list:
            task = asyncio.create_task(
                self.get_comments(video_id, semaphore), name=video_id
            )
            task_list.append(task)
        comment_tasks_var.set(task_list)
        await asyncio.gather(*task_list)

    async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
        """
        Get comments for a video id
        :param video_id:
        :param semaphore:
        :return:
        """
        async with semaphore:
            try:
                utils.logger.info(
                    f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
                )

                # Sleep before fetching comments
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
                utils.logger.info(
                    f"[KuaishouCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}"
                )

                await self.ks_client.get_video_all_comments(
                    photo_id=video_id,
                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                    callback=kuaishou_store.batch_update_ks_video_comments,
                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                )
            except DataFetchError as ex:
                utils.logger.error(
                    f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}"
                )
            except Exception as e:
                utils.logger.error(
                    f"[KuaishouCrawler.get_comments] may have been blocked, err: {e}"
                )
                # Use time.sleep to block the main coroutine (instead of asyncio.sleep) and cancel the running comment tasks.
                # Kuaishou may have blocked our requests, so take a nap and then refresh the cookies.
                current_running_tasks = comment_tasks_var.get()
                for task in current_running_tasks:
                    task.cancel()
                time.sleep(20)
                await self.context_page.goto(f"{self.index_url}?isHome=1")
                await self.ks_client.update_cookies(
                    browser_context=self.browser_context
                )

    async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
        """Create ks client"""
        utils.logger.info(
            "[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..."
        )
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
        ks_client_obj = KuaiShouClient(
            proxy=httpx_proxy,
            headers={
                "User-Agent": self.user_agent,
                "Cookie": cookie_str,
                "Origin": self.index_url,
                "Referer": self.index_url,
                "Content-Type": "application/json;charset=UTF-8",
            },
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )
        return ks_client_obj

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """Launch browser and create browser context"""
        utils.logger.info(
            "[KuaishouCrawler.launch_browser] Begin create browser context ..."
        )
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(
                os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
            )  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent,
                channel="chrome",  # use the system's stable Chrome channel
            )
            return browser_context
        else:
            browser = await chromium.launch(
                headless=headless, proxy=playwright_proxy, channel="chrome"
            )  # type: ignore
            browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080}, user_agent=user_agent
            )
            return browser_context

    async def launch_browser_with_cdp(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch the browser in CDP (Chrome DevTools Protocol) mode.
        """
        try:
            self.cdp_manager = CDPBrowserManager()
            browser_context = await self.cdp_manager.launch_and_connect(
                playwright=playwright,
                playwright_proxy=playwright_proxy,
                user_agent=user_agent,
                headless=headless,
            )
            # Log browser info
            browser_info = await self.cdp_manager.get_browser_info()
            utils.logger.info(f"[KuaishouCrawler] CDP browser info: {browser_info}")
            return browser_context
        except Exception as e:
            utils.logger.error(
                f"[KuaishouCrawler] CDP mode launch failed, falling back to standard mode: {e}"
            )
            # Fall back to standard mode
            chromium = playwright.chromium
            return await self.launch_browser(
                chromium, playwright_proxy, user_agent, headless
            )

    async def get_creators_and_videos(self) -> None:
        """Get creator's videos and retrieve their comment information."""
        utils.logger.info(
            "[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
        )
        for creator_url in config.KS_CREATOR_ID_LIST:
            try:
                # Parse creator URL to get user_id
                creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
                utils.logger.info(
                    f"[KuaiShouCrawler.get_creators_and_videos] Parse creator URL info: {creator_info}"
                )
                user_id = creator_info.user_id

                # Get creator detail info from the web HTML content
                creator_detail: Dict = await self.ks_client.get_creator_info(user_id=user_id)
                if creator_detail:
                    await kuaishou_store.save_creator(user_id, creator=creator_detail)
            except ValueError as e:
                utils.logger.error(
                    f"[KuaiShouCrawler.get_creators_and_videos] Failed to parse creator URL: {e}"
                )
                continue

            # Get all video information of the creator
            all_video_list = await self.ks_client.get_all_videos_by_creator(
                user_id=user_id,
                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                callback=self.fetch_creator_video_detail,
            )
            video_ids = [
                video_item.get("photo", {}).get("id") for video_item in all_video_list
            ]
            await self.batch_get_video_comments(video_ids)

    async def fetch_creator_video_detail(self, video_list: List[Dict]):
        """
        Concurrently obtain the specified post list and save the data
        """
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore)
            for post_item in video_list
        ]
        video_details = await asyncio.gather(*task_list)
        for video_detail in video_details:
            if video_detail is not None:
                await kuaishou_store.update_kuaishou_video(video_detail)

    async def close(self):
        """Close browser context"""
        # CDP mode needs its own cleanup path
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        else:
            await self.browser_context.close()
        utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
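

# --- Minimal usage sketch (not part of the original module) ---
# In the real project this crawler is constructed and driven from the CLI entry
# point via a platform factory; the guard below is only an illustrative sketch,
# assuming config (LOGIN_TYPE, CRAWLER_TYPE, KEYWORDS, COOKIES, ...) has already
# been populated.
if __name__ == "__main__":
    try:
        asyncio.run(KuaishouCrawler().start())
    except KeyboardInterrupt:
        pass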