# Disclaimer: This code is for learning and research purposes only. Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Large-scale crawling or anything that disrupts the platform's operation is prohibited.
# 4. Request rates must be kept reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to the principles above and to all terms in the LICENSE.

import asyncio
import os
import random
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple

from playwright.async_api import BrowserContext, BrowserType, Page, async_playwright
from tenacity import RetryError

import config
from base.base_crawler import AbstractCrawler
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
from model.m_xiaohongshu import NoteUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
from var import crawler_type_var, source_keyword_var

from .client import XiaoHongShuClient
from .exception import DataFetchError
from .field import SearchSortType
from .help import parse_note_info_from_note_url, get_search_id
from .login import XiaoHongShuLogin


class XiaoHongShuCrawler(AbstractCrawler):
    context_page: Page
    xhs_client: XiaoHongShuClient
    browser_context: BrowserContext

    def __init__(self) -> None:
        self.index_url = "https://www.xiaohongshu.com"
        # self.user_agent = utils.get_user_agent()
        self.user_agent = config.UA if config.UA else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"

    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            ip_proxy_pool = await create_ip_pool(
                config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
            )
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(
                ip_proxy_info
            )

        async with async_playwright() as playwright:
            # Launch a browser context.
            chromium = playwright.chromium
            self.browser_context = await self.launch_browser(
                chromium, None, self.user_agent, headless=config.HEADLESS
            )
            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            # add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
            await self.browser_context.add_cookies(
                [
                    {
                        "name": "webId",
                        "value": "xxx123",  # any value
                        "domain": ".xiaohongshu.com",
                        "path": "/",
                    }
                ]
            )
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.index_url)

            # Create a client to interact with the xiaohongshu website.
            self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
            if not await self.xhs_client.pong():
                login_obj = XiaoHongShuLogin(
                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # input your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES,
                )
                await login_obj.begin()
                await self.xhs_client.update_cookies(
                    browser_context=self.browser_context
                )

            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
                await self.get_creators_and_notes()
            else:
                pass

            utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")

    async def search(self) -> None:
        """Search for notes and retrieve their comment information."""
        utils.logger.info(
            "[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords"
        )
        xhs_limit_count = 20  # xhs limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            source_keyword_var.set(keyword)
            utils.logger.info(
                f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}"
            )
            page = 1
            search_id = get_search_id()
            while (
                page - start_page + 1
            ) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
                    page += 1
                    continue
                try:
                    utils.logger.info(
                        f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}"
                    )
                    note_ids: List[str] = []
                    xsec_tokens: List[str] = []
                    notes_res = await self.xhs_client.get_note_by_keyword(
                        keyword=keyword,
                        search_id=search_id,
                        page=page,
                        sort=(
                            SearchSortType(config.SORT_TYPE)
                            if config.SORT_TYPE != ""
                            else SearchSortType.GENERAL
                        ),
                    )
                    utils.logger.info(
                        f"[XiaoHongShuCrawler.search] Search notes res: {notes_res}"
                    )
                    if not notes_res or not notes_res.get("has_more", False):
                        utils.logger.info("No more content!")
                        break
                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                    task_list = [
                        self.get_note_detail_async_task(
                            note_id=post_item.get("id"),
                            xsec_source=post_item.get("xsec_source"),
                            xsec_token=post_item.get("xsec_token"),
                            semaphore=semaphore,
                        )
                        for post_item in notes_res.get("items", {})
                        if post_item.get("model_type") not in ("rec_query", "hot_query")
                    ]
                    note_details = await asyncio.gather(*task_list)
                    for note_detail in note_details:
                        if note_detail:
                            await xhs_store.update_xhs_note(note_detail)
                            await self.get_notice_media(note_detail)
                            note_ids.append(note_detail.get("note_id"))
                            xsec_tokens.append(note_detail.get("xsec_token"))
                    page += 1
                    utils.logger.info(
                        f"[XiaoHongShuCrawler.search] Note details: {note_details}"
                    )
                    await self.batch_get_note_comments(note_ids, xsec_tokens)
                except DataFetchError:
                    utils.logger.error(
                        "[XiaoHongShuCrawler.search] Get note detail error"
                    )
                    break
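
    # Illustrative shape of one entry in notes_res["items"] consumed above (fields inferred
    # from the accesses in this method; values are placeholders, other fields omitted):
    #   {"id": "<note_id>", "model_type": "note", "xsec_source": "pc_search", "xsec_token": "<token>"}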

    async def get_creators_and_notes(self) -> None:
        """Get creator's notes and retrieve their comment information."""
        utils.logger.info(
            "[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators"
        )
        for user_id in config.XHS_CREATOR_ID_LIST:
            # get creator detail info from web html content
            creator_info: Dict = await self.xhs_client.get_creator_info(
                user_id=user_id
            )
            if creator_info:
                await xhs_store.save_creator(user_id, creator=creator_info)

            # When proxy is not enabled, increase the crawling interval
            if config.ENABLE_IP_PROXY:
                crawl_interval = random.random()
            else:
                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
            # Get all note information of the creator
            all_notes_list = await self.xhs_client.get_all_notes_by_creator(
                user_id=user_id,
                crawl_interval=crawl_interval,
                callback=self.fetch_creator_notes_detail,
            )

            note_ids = []
            xsec_tokens = []
            for note_item in all_notes_list:
                note_ids.append(note_item.get("note_id"))
                xsec_tokens.append(note_item.get("xsec_token"))
            await self.batch_get_note_comments(note_ids, xsec_tokens)

    async def fetch_creator_notes_detail(self, note_list: List[Dict]):
        """
        Concurrently obtain the specified post list and save the data
        """
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_note_detail_async_task(
                note_id=post_item.get("note_id"),
                xsec_source=post_item.get("xsec_source"),
                xsec_token=post_item.get("xsec_token"),
                semaphore=semaphore,
            )
            for post_item in note_list
        ]

        note_details = await asyncio.gather(*task_list)
        for note_detail in note_details:
            if note_detail:
                await xhs_store.update_xhs_note(note_detail)

    async def get_specified_notes(self):
        """
        Get the information and comments of the specified posts.
        Each entry must provide note_id, xsec_source and xsec_token.
        """
        get_note_detail_task_list = []
        # Share one semaphore across all tasks so MAX_CONCURRENCY_NUM actually bounds concurrency
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
            note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
            utils.logger.info(
                f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}"
            )
            crawler_task = self.get_note_detail_async_task(
                note_id=note_url_info.note_id,
                xsec_source=note_url_info.xsec_source,
                xsec_token=note_url_info.xsec_token,
                semaphore=semaphore,
            )
            get_note_detail_task_list.append(crawler_task)

        need_get_comment_note_ids = []
        xsec_tokens = []
        note_details = await asyncio.gather(*get_note_detail_task_list)
        for note_detail in note_details:
            if note_detail:
                need_get_comment_note_ids.append(note_detail.get("note_id", ""))
                xsec_tokens.append(note_detail.get("xsec_token", ""))
                await xhs_store.update_xhs_note(note_detail)
        await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
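
    # Illustrative entry for config.XHS_SPECIFIED_NOTE_URL_LIST (placeholder id and token;
    # the query parameters come from copying a note link in the browser):
    #   "https://www.xiaohongshu.com/explore/<note_id>?xsec_token=<token>&xsec_source=pc_search"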

    async def get_note_detail_async_task(
        self,
        note_id: str,
        xsec_source: str,
        xsec_token: str,
        semaphore: asyncio.Semaphore,
    ) -> Optional[Dict]:
        """Get note detail

        Args:
            note_id:
            xsec_source:
            xsec_token:
            semaphore:

        Returns:
            Dict: note detail
        """
        note_detail_from_html, note_detail_from_api = None, None
        async with semaphore:
            # When proxy is not enabled, increase the crawling interval
            if config.ENABLE_IP_PROXY:
                crawl_interval = random.random()
            else:
                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
            try:
                # Try to fetch the web-version note detail directly, with cookies attached
                note_detail_from_html: Optional[Dict] = (
                    await self.xhs_client.get_note_by_id_from_html(
                        note_id, xsec_source, xsec_token, enable_cookie=True
                    )
                )
                # Sleep asynchronously so other crawl tasks are not blocked
                await asyncio.sleep(crawl_interval)
                if not note_detail_from_html:
                    # If fetching the web-version note detail failed, retry without cookies
                    note_detail_from_html = (
                        await self.xhs_client.get_note_by_id_from_html(
                            note_id, xsec_source, xsec_token, enable_cookie=False
                        )
                    )
                    utils.logger.error(
                        f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
                    )
                if not note_detail_from_html:
                    # If the web-version note detail is still unavailable, fall back to the API
                    note_detail_from_api: Optional[Dict] = (
                        await self.xhs_client.get_note_by_id(
                            note_id, xsec_source, xsec_token
                        )
                    )
                note_detail = note_detail_from_html or note_detail_from_api
                if note_detail:
                    note_detail.update(
                        {"xsec_token": xsec_token, "xsec_source": xsec_source}
                    )
                return note_detail
            except DataFetchError as ex:
                utils.logger.error(
                    f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
                )
                return None
            except KeyError as ex:
                utils.logger.error(
                    f"[XiaoHongShuCrawler.get_note_detail_async_task] could not find note detail, note_id: {note_id}, err: {ex}"
                )
                return None

    async def batch_get_note_comments(
        self, note_list: List[str], xsec_tokens: List[str]
    ):
        """Batch get note comments"""
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info(
                "[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled"
            )
            return

        utils.logger.info(
            f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}"
        )
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for index, note_id in enumerate(note_list):
            task = asyncio.create_task(
                self.get_comments(
                    note_id=note_id, xsec_token=xsec_tokens[index], semaphore=semaphore
                ),
                name=note_id,
            )
            task_list.append(task)
        await asyncio.gather(*task_list)

    async def get_comments(
        self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
    ):
        """Get note comments with keyword filtering and quantity limitation"""
        async with semaphore:
            utils.logger.info(
                f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}"
            )
            # When proxy is not enabled, increase the crawling interval
            if config.ENABLE_IP_PROXY:
                crawl_interval = random.random()
            else:
                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
            await self.xhs_client.get_note_all_comments(
                note_id=note_id,
                xsec_token=xsec_token,
                crawl_interval=crawl_interval,
                callback=xhs_store.batch_update_xhs_note_comments,
                max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )

    @staticmethod
    def format_proxy_info(
        ip_proxy_info: IpInfoModel,
    ) -> Tuple[Optional[Dict], Optional[Dict]]:
        """format proxy info for playwright and httpx"""
        playwright_proxy = {
            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
            "username": ip_proxy_info.user,
            "password": ip_proxy_info.password,
        }
        httpx_proxy = {
            f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
        }
        return playwright_proxy, httpx_proxy
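
    # Illustrative shapes of the two dicts returned above (placeholder values; assumes
    # ip_proxy_info.protocol is something like "http://"):
    #   playwright_proxy -> {"server": "http://1.2.3.4:8888", "username": "user", "password": "pass"}
    #   httpx_proxy      -> {"http://": "http://user:pass@1.2.3.4:8888"}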

    async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
        """Create xhs client"""
        utils.logger.info(
            "[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ..."
        )
        cookie_str, cookie_dict = utils.convert_cookies(
            await self.browser_context.cookies()
        )
        xhs_client_obj = XiaoHongShuClient(
            proxies=httpx_proxy,
            headers={
                "User-Agent": self.user_agent,
                "Cookie": cookie_str,
                "Origin": "https://www.xiaohongshu.com",
                "Referer": "https://www.xiaohongshu.com",
                "Content-Type": "application/json;charset=UTF-8",
            },
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )
        return xhs_client_obj

    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """Launch browser and create browser context"""
        utils.logger.info(
            "[XiaoHongShuCrawler.launch_browser] Begin create browser context ..."
        )
        if config.SAVE_LOGIN_STATE:
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(
                os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
            )  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent,
            )
            return browser_context
        else:
            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
            browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080}, user_agent=user_agent
            )
            return browser_context

    async def close(self):
        """Close browser context"""
        await self.browser_context.close()
        utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")

    async def get_notice_media(self, note_detail: Dict):
        if not config.ENABLE_GET_IMAGES:
            utils.logger.info(
                "[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled"
            )
            return
        await self.get_note_images(note_detail)
        await self.get_notice_video(note_detail)

    async def get_note_images(self, note_item: Dict):
        """
        get note images. please use get_notice_media
        :param note_item:
        :return:
        """
        if not config.ENABLE_GET_IMAGES:
            return
        note_id = note_item.get("note_id")
        image_list: List[Dict] = note_item.get("image_list", [])
        for img in image_list:
            if img.get("url_default") != "":
                img.update({"url": img.get("url_default")})

        if not image_list:
            return
        pic_num = 0
        for pic in image_list:
            url = pic.get("url")
            if not url:
                continue
            content = await self.xhs_client.get_note_media(url)
            if content is None:
                continue
            extension_file_name = f"{pic_num}.jpg"
            pic_num += 1
            await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)

    async def get_notice_video(self, note_item: Dict):
        """
        get note videos. please use get_notice_media
        :param note_item:
        :return:
        """
        if not config.ENABLE_GET_IMAGES:
            return
        note_id = note_item.get("note_id")
        videos = xhs_store.get_video_url_arr(note_item)
        if not videos:
            return
        video_num = 0
        for url in videos:
            content = await self.xhs_client.get_note_media(url)
            if content is None:
                continue
            extension_file_name = f"{video_num}.mp4"
            video_num += 1
            await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
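

# A minimal usage sketch (illustrative only; in this repo the crawler is normally constructed
# and driven by the project's entry point rather than run from this module):
#
#   async def run() -> None:
#       crawler = XiaoHongShuCrawler()
#       try:
#           await crawler.start()
#       finally:
#           await crawler.close()
#
#   asyncio.run(run())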