# Disclaimer: This code is for learning and research purposes only. Users must
# observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Large-scale crawling or any disruption of the platform's operation is prohibited.
# 4. Request rates must be kept reasonable to avoid placing unnecessary load on the platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to the principles above and to all terms in LICENSE.

import asyncio
import os

# import random  # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
from asyncio import Task
from typing import Dict, List, Optional, Tuple

from playwright.async_api import (
    BrowserContext,
    BrowserType,
    Page,
    Playwright,
    async_playwright,
)

import config
from base.base_crawler import AbstractCrawler
from model.m_baidu_tieba import TiebaCreator, TiebaNote
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import tieba as tieba_store
from tools import utils
from tools.cdp_browser import CDPBrowserManager
from var import crawler_type_var, source_keyword_var

from .client import BaiduTieBaClient
from .field import SearchNoteType, SearchSortType
from .help import TieBaExtractor
from .login import BaiduTieBaLogin


class TieBaCrawler(AbstractCrawler):
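    """
    Baidu Tieba crawler.

    Dispatches on config.CRAWLER_TYPE ("search", "detail" or "creator") and
    fetches notes and comments through BaiduTieBaClient, optionally routing
    requests through an IP proxy pool. The Playwright/CDP launch helpers
    below are kept for flows that need a real browser context.
    """
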
    context_page: Page
    tieba_client: BaiduTieBaClient
    browser_context: BrowserContext
    cdp_manager: Optional[CDPBrowserManager]

    def __init__(self) -> None:
        self.index_url = "https://tieba.baidu.com"
        self.user_agent = utils.get_user_agent()
        self._page_extractor = TieBaExtractor()
        self.cdp_manager = None

    async def start(self) -> None:
        """
        Start the crawler
        Returns:

        """
        ip_proxy_pool, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            utils.logger.info(
                "[BaiduTieBaCrawler.start] Begin create ip proxy pool ..."
            )
            ip_proxy_pool = await create_ip_pool(
                config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
            )
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            _, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
            utils.logger.info(
                f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}"
            )

        # Create a client to interact with the baidutieba website.
        self.tieba_client = BaiduTieBaClient(
            ip_pool=ip_proxy_pool,
            default_ip_proxy=httpx_proxy_format,
        )
        crawler_type_var.set(config.CRAWLER_TYPE)
        if config.CRAWLER_TYPE == "search":
            # Search for notes and retrieve their comment information.
            await self.search()
            await self.get_specified_tieba_notes()
        elif config.CRAWLER_TYPE == "detail":
            # Get the information and comments of the specified post
            await self.get_specified_notes()
        elif config.CRAWLER_TYPE == "creator":
            # Get creator's information and their notes and comments
            await self.get_creators_and_notes()
        else:
            pass

        utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")

    async def search(self) -> None:
        """
        Search for notes and retrieve their comment information.
        Returns:

        """
        utils.logger.info(
            "[BaiduTieBaCrawler.search] Begin search baidu tieba keywords"
        )
        tieba_limit_count = 10  # tieba limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}"
            )
            page = 1
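            # Each result page holds tieba_limit_count (10) notes, so the loop
            # stops once the pages fetched from start_page onward would exceed
            # config.CRAWLER_MAX_NOTES_COUNT notes in total.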
            while (
                page - start_page + 1
            ) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}")
                    page += 1
                    continue
                try:
                    utils.logger.info(
                        f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}"
                    )
                    notes_list: List[TiebaNote] = (
                        await self.tieba_client.get_notes_by_keyword(
                            keyword=keyword,
                            page=page,
                            page_size=tieba_limit_count,
                            sort=SearchSortType.TIME_DESC,
                            note_type=SearchNoteType.FIXED_THREAD,
                        )
                    )
                    if not notes_list:
                        utils.logger.info(
                            "[BaiduTieBaCrawler.search] Search note list is empty"
                        )
                        break
                    utils.logger.info(
                        f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}"
                    )
                    await self.get_specified_notes(
                        note_id_list=[note_detail.note_id for note_detail in notes_list]
                    )
                    # Sleep after page navigation
                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
                    utils.logger.info(
                        f"[BaiduTieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}"
                    )
                    page += 1
                except Exception as ex:
                    utils.logger.error(
                        f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}"
                    )
                    break

    async def get_specified_tieba_notes(self):
        """
        Get the information and comments of the specified post by tieba name
        Returns:

        """
        tieba_limit_count = 50
        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
        for tieba_name in config.TIEBA_NAME_LIST:
            utils.logger.info(
                f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}"
            )
            page_number = 0
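            # page_num appears to be an offset that advances in steps of
            # tieba_limit_count (50) per iteration, not a 1-based page index.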
            while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
                note_list: List[TiebaNote] = (
                    await self.tieba_client.get_notes_by_tieba_name(
                        tieba_name=tieba_name, page_num=page_number
                    )
                )
                if not note_list:
                    utils.logger.info(
                        "[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty"
                    )
                    break
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
                )
                await self.get_specified_notes([note.note_id for note in note_list])
                # Sleep after processing notes
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}"
                )
                page_number += tieba_limit_count

    async def get_specified_notes(
        self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST
    ):
        """
        Get the information and comments of the specified post
        Args:
            note_id_list:

        Returns:

        """
        # Fetch note details concurrently, capping in-flight requests with a
        # semaphore sized by config.MAX_CONCURRENCY_NUM.
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
            for note_id in note_id_list
        ]
        note_details = await asyncio.gather(*task_list)
        note_details_model: List[TiebaNote] = []
        for note_detail in note_details:
            if note_detail is not None:
                note_details_model.append(note_detail)
                await tieba_store.update_tieba_note(note_detail)
        await self.batch_get_note_comments(note_details_model)

    async def get_note_detail_async_task(
        self, note_id: str, semaphore: asyncio.Semaphore
    ) -> Optional[TiebaNote]:
        """
        Get note detail
        Args:
            note_id: baidu tieba note id
            semaphore: asyncio semaphore
        Returns:

        """
        async with semaphore:
            try:
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
                )
                note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
                # Sleep after fetching note details
                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note detail {note_id}"
                )
                if not note_detail:
                    utils.logger.error(
                        f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
                    )
                    return None
                return note_detail
            # Catch KeyError before the generic Exception handler so this
            # branch is reachable.
            except KeyError as ex:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.get_note_detail] have not found note detail note_id: {note_id}, err: {ex}"
                )
                return None
            except Exception as ex:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}"
                )
                return None

    async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]):
        """
        Batch get note comments
        Args:
            note_detail_list:

        Returns:

        """
        if not config.ENABLE_GET_COMMENTS:
            return
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for note_detail in note_detail_list:
            task = asyncio.create_task(
                self.get_comments_async_task(note_detail, semaphore),
                name=note_detail.note_id,
            )
            task_list.append(task)
        await asyncio.gather(*task_list)

    async def get_comments_async_task(
        self, note_detail: TiebaNote, semaphore: asyncio.Semaphore
    ):
        """
        Get comments async task
        Args:
            note_detail:
            semaphore:
        Returns:

        """
        async with semaphore:
            utils.logger.info(
                f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
            )
            # Sleep before fetching comments
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
            utils.logger.info(
                f"[BaiduTieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}"
            )
            await self.tieba_client.get_note_all_comments(
                note_detail=note_detail,
                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                callback=tieba_store.batch_update_tieba_note_comments,
                max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )

    async def get_creators_and_notes(self) -> None:
        """
        Get creator's information and their notes and comments
        Returns:

        """
        utils.logger.info(
            "[BaiduTieBaCrawler.get_creators_and_notes] Begin get tieba creators"
        )
        for creator_url in config.TIEBA_CREATOR_URL_LIST:
            creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
                creator_url=creator_url
            )
            creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
                creator_page_html_content
            )
            if creator_info:
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_creators_and_notes] creator info: {creator_info}"
                )
                await tieba_store.save_creator(user_info=creator_info)
                # Get all note information of the creator
                all_notes_list = (
                    await self.tieba_client.get_all_notes_by_creator_user_name(
                        user_name=creator_info.user_name,
                        crawl_interval=0,
                        callback=tieba_store.batch_update_tieba_notes,
                        max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
                        creator_page_html_content=creator_page_html_content,
                    )
                )
                await self.batch_get_note_comments(all_notes_list)
            else:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.get_creators_and_notes] get creator info error, creator_url: {creator_url}"
                )

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch browser and create browser context
        Args:
            chromium:
            playwright_proxy:
            user_agent:
            headless:
        Returns:

        """
        utils.logger.info(
            "[BaiduTieBaCrawler.launch_browser] Begin create browser context ..."
        )
        if config.SAVE_LOGIN_STATE:
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(
                os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
            )  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent,
            )
            return browser_context
        else:
            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
            browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080}, user_agent=user_agent
            )
            return browser_context

    async def launch_browser_with_cdp(
        self,
        playwright: Playwright,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        Launch the browser in CDP (Chrome DevTools Protocol) mode
        """
        try:
            self.cdp_manager = CDPBrowserManager()
            browser_context = await self.cdp_manager.launch_and_connect(
                playwright=playwright,
                playwright_proxy=playwright_proxy,
                user_agent=user_agent,
                headless=headless,
            )
            # Log browser information
            browser_info = await self.cdp_manager.get_browser_info()
            utils.logger.info(f"[TieBaCrawler] CDP browser info: {browser_info}")
            return browser_context
        except Exception as e:
            utils.logger.error(
                f"[TieBaCrawler] CDP mode launch failed, falling back to standard mode: {e}"
            )
            # Fall back to the standard launch path
            chromium = playwright.chromium
            return await self.launch_browser(
                chromium, playwright_proxy, user_agent, headless
            )

    async def close(self):
        """
        Close browser context
        Returns:

        """
        # CDP mode needs its own cleanup path
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None
        else:
            await self.browser_context.close()
        utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
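
# A minimal usage sketch (illustrative only; in this project the crawler is
# normally constructed and started by the main entry point after config is
# loaded, and this module uses relative imports, so it cannot be run directly
# as a script):
#
#   crawler = TieBaCrawler()
#   asyncio.run(crawler.start())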