Merge pull request #687 from 2513502304/main

Add logic for crawling short videos and post images from Douyin, and standardize the media-storage logic across the four platforms that currently crawl media: Douyin, Bilibili, Xiaohongshu, and Weibo
This commit is contained in:
程序员阿江-Relakkes
2025-07-30 23:06:35 +08:00
committed by GitHub
19 changed files with 944 additions and 1064 deletions
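At a high level, the media path standardized here is: the platform API client downloads the raw bytes over HTTP, and the crawler hands those bytes to the platform's store layer. A minimal sketch of that flow, assuming the get_video_media / store_video names visible in the diffs below; the helper function and the dict keys are illustrative, not part of this PR:

from typing import Dict, Optional

async def download_and_store_media(client, store, content_item: Dict) -> None:
    # Illustrative glue: fetch one media URL via the platform client, then hand
    # the bytes to a store implementing store_video(); key names are assumptions.
    url: Optional[str] = content_item.get("video_url") or content_item.get("image_url")
    if not url:
        return
    content = await client.get_video_media(url)  # bilibili-style downloader, returns bytes or None
    if content is None:
        return
    await store.store_video({"video_id": content_item.get("id"), "video_content": content})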

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict, Optional from typing import Dict, Optional
@@ -16,6 +15,7 @@ from playwright.async_api import BrowserContext, BrowserType, Playwright
class AbstractCrawler(ABC): class AbstractCrawler(ABC):
@abstractmethod @abstractmethod
async def start(self): async def start(self):
""" """
@@ -31,8 +31,7 @@ class AbstractCrawler(ABC):
pass pass
@abstractmethod @abstractmethod
-    async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str],
-                             headless: bool = True) -> BrowserContext:
+    async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext:
""" """
launch browser launch browser
:param chromium: chromium browser :param chromium: chromium browser
@@ -43,8 +42,7 @@ class AbstractCrawler(ABC):
""" """
pass pass
-    async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict],
-                                      user_agent: Optional[str], headless: bool = True) -> BrowserContext:
+    async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext:
""" """
使用CDP模式启动浏览器可选实现 使用CDP模式启动浏览器可选实现
:param playwright: playwright实例 :param playwright: playwright实例
@@ -58,6 +56,7 @@ class AbstractCrawler(ABC):
class AbstractLogin(ABC): class AbstractLogin(ABC):
@abstractmethod @abstractmethod
async def begin(self): async def begin(self):
pass pass
@@ -76,6 +75,7 @@ class AbstractLogin(ABC):
class AbstractStore(ABC): class AbstractStore(ABC):
@abstractmethod @abstractmethod
async def store_content(self, content_item: Dict): async def store_content(self, content_item: Dict):
pass pass
@@ -99,7 +99,16 @@ class AbstractStoreImage(ABC):
pass pass
class AbstractStoreVideo(ABC):
# TODO: support all platform
# only weibo is supported
# @abstractmethod
async def store_video(self, video_content_item: Dict):
pass
class AbstractApiClient(ABC): class AbstractApiClient(ABC):
@abstractmethod @abstractmethod
async def request(self, method, url, **kwargs): async def request(self, method, url, **kwargs):
pass pass
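For context on the new AbstractStoreVideo hook added above, a minimal concrete store might look like the sketch below; only the store_video(video_content_item: Dict) signature comes from the diff, while the class name, save directory, and dict keys are assumptions:

import pathlib
from typing import Dict

class LocalVideoStore:
    # Would subclass AbstractStoreVideo in the real code base; kept standalone here.
    def __init__(self, save_dir: str = "data/videos"):
        self.save_dir = pathlib.Path(save_dir)
        self.save_dir.mkdir(parents=True, exist_ok=True)

    async def store_video(self, video_content_item: Dict):
        # Assumed payload: an id plus the raw bytes downloaded by the API client.
        file_path = self.save_dir / f"{video_content_item['video_id']}.mp4"
        file_path.write_bytes(video_content_item["video_content"])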

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# 基础配置 # 基础配置
PLATFORM = "xhs" # 平台xhs | dy | ks | bili | wb | tieba | zhihu PLATFORM = "xhs" # 平台xhs | dy | ks | bili | wb | tieba | zhihu
KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔 KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔
@@ -77,8 +76,8 @@ CRAWLER_MAX_NOTES_COUNT = 200
# 并发爬虫数量控制 # 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1 MAX_CONCURRENCY_NUM = 1
-# 是否开启爬图片模式, 默认不开启爬图片
-ENABLE_GET_IMAGES = False
+# 是否开启爬媒体模式(包含图片或视频资源),默认不开启爬媒体
+ENABLE_GET_MEIDAS = False
# 是否开启爬评论模式, 默认开启爬评论 # 是否开启爬评论模式, 默认开启爬评论
ENABLE_GET_COMMENTS = True ENABLE_GET_COMMENTS = True
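Downstream code gates every media download on the renamed flag; a condensed illustration (the wrapper function is made up, while the flag check and early return mirror the crawlers later in this diff):

import config  # project config module where ENABLE_GET_MEIDAS now lives

async def maybe_fetch_media(crawler, item) -> None:
    # When the flag is off, only text metadata and comments are stored.
    if not config.ENABLE_GET_MEIDAS:
        return
    await crawler.get_aweme_media(aweme_item=item)  # e.g. the douyin crawler's downloader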

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2023/12/2 18:44 # @Time : 2023/12/2 18:44
@@ -32,14 +31,15 @@ from .help import BilibiliSign
class BilibiliClient(AbstractApiClient): class BilibiliClient(AbstractApiClient):
def __init__( def __init__(
self, self,
-        timeout=10,
+        timeout=60,  # 若开启爬取媒体选项,b站的长视频需要更久的超时时间
proxies=None, proxies=None,
*, *,
headers: Dict[str, str], headers: Dict[str, str],
playwright_page: Page, playwright_page: Page,
cookie_dict: Dict[str, str], cookie_dict: Dict[str, str],
): ):
self.proxies = proxies self.proxies = proxies
self.timeout = timeout self.timeout = timeout
@@ -50,10 +50,7 @@ class BilibiliClient(AbstractApiClient):
async def request(self, method, url, **kwargs) -> Any: async def request(self, method, url, **kwargs) -> Any:
async with httpx.AsyncClient(proxies=self.proxies) as client: async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request( response = await client.request(method, url, timeout=self.timeout, **kwargs)
method, url, timeout=self.timeout,
**kwargs
)
try: try:
data: Dict = response.json() data: Dict = response.json()
except json.JSONDecodeError: except json.JSONDecodeError:
@@ -111,8 +108,7 @@ class BilibiliClient(AbstractApiClient):
async def post(self, uri: str, data: dict) -> Dict: async def post(self, uri: str, data: dict) -> Dict:
data = await self.pre_request_data(data) data = await self.pre_request_data(data)
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=f"{self._host}{uri}", return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers)
data=json_str, headers=self.headers)
async def pong(self) -> bool: async def pong(self) -> bool:
"""get a note to check if login state is ok""" """get a note to check if login state is ok"""
@@ -122,12 +118,10 @@ class BilibiliClient(AbstractApiClient):
check_login_uri = "/x/web-interface/nav" check_login_uri = "/x/web-interface/nav"
response = await self.get(check_login_uri) response = await self.get(check_login_uri)
if response.get("isLogin"): if response.get("isLogin"):
utils.logger.info( utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!")
"[BilibiliClient.pong] Use cache login state get web interface successfull!")
ping_flag = True ping_flag = True
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
ping_flag = False ping_flag = False
return ping_flag return ping_flag
@@ -136,10 +130,15 @@ class BilibiliClient(AbstractApiClient):
self.headers["Cookie"] = cookie_str self.headers["Cookie"] = cookie_str
self.cookie_dict = cookie_dict self.cookie_dict = cookie_dict
-    async def search_video_by_keyword(self, keyword: str, page: int = 1, page_size: int = 20,
-                                      order: SearchOrderType = SearchOrderType.DEFAULT,
-                                      pubtime_begin_s: int = 0, pubtime_end_s: int = 0) -> Dict:
+    async def search_video_by_keyword(
+        self,
+        keyword: str,
+        page: int = 1,
+        page_size: int = 20,
+        order: SearchOrderType = SearchOrderType.DEFAULT,
+        pubtime_begin_s: int = 0,
+        pubtime_end_s: int = 0,
+    ) -> Dict:
""" """
KuaiShou web search api KuaiShou web search api
:param keyword: 搜索关键词 :param keyword: 搜索关键词
@@ -210,11 +209,12 @@ class BilibiliClient(AbstractApiClient):
else: else:
return response.content return response.content
-    async def get_video_comments(self,
-                                 video_id: str,
-                                 order_mode: CommentOrderType = CommentOrderType.DEFAULT,
-                                 next: int = 0
-                                 ) -> Dict:
+    async def get_video_comments(
+        self,
+        video_id: str,
+        order_mode: CommentOrderType = CommentOrderType.DEFAULT,
+        next: int = 0,
+    ) -> Dict:
"""get video comments """get video comments
:param video_id: 视频 ID :param video_id: 视频 ID
:param order_mode: 排序方式 :param order_mode: 排序方式
@@ -222,18 +222,17 @@ class BilibiliClient(AbstractApiClient):
:return: :return:
""" """
uri = "/x/v2/reply/wbi/main" uri = "/x/v2/reply/wbi/main"
post_data = { post_data = {"oid": video_id, "mode": order_mode.value, "type": 1, "ps": 20, "next": next}
"oid": video_id,
"mode": order_mode.value,
"type": 1,
"ps": 20,
"next": next
}
return await self.get(uri, post_data) return await self.get(uri, post_data)
-    async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
-                                     callback: Optional[Callable] = None,
-                                     max_count: int = 10,):
+    async def get_video_all_comments(
+        self,
+        video_id: str,
+        crawl_interval: float = 1.0,
+        is_fetch_sub_comments=False,
+        callback: Optional[Callable] = None,
+        max_count: int = 10,
+    ):
""" """
get video all comments include sub comments get video all comments include sub comments
:param video_id: :param video_id:
@@ -256,15 +255,11 @@ class BilibiliClient(AbstractApiClient):
break # Success break # Success
except DataFetchError as e: except DataFetchError as e:
if attempt < max_retries - 1: if attempt < max_retries - 1:
delay = 5 * (2 ** attempt) + random.uniform(0, 1) delay = 5 * (2**attempt) + random.uniform(0, 1)
utils.logger.warning( utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})")
f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})"
)
await asyncio.sleep(delay) await asyncio.sleep(delay)
else: else:
utils.logger.error( utils.logger.error(f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}")
f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}"
)
is_end = True is_end = True
break break
if not comments_res: if not comments_res:
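The retry loop above keeps an exponential backoff of 5 * 2**attempt seconds plus up to one second of jitter. A standalone illustration of the schedule it produces (max_retries is not shown in this hunk; 3 is assumed):

import random

def backoff_schedule(max_retries: int = 3) -> list:
    # Same formula as BilibiliClient.get_video_all_comments: 5 * 2**attempt + jitter.
    return [5 * (2 ** attempt) + random.uniform(0, 1) for attempt in range(max_retries - 1)]

print(backoff_schedule())  # roughly [5.x, 10.x]: ~5s before the 2nd try, ~10s before the 3rd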
@@ -292,10 +287,7 @@ class BilibiliClient(AbstractApiClient):
for comment in comment_list: for comment in comment_list:
comment_id = comment['rpid'] comment_id = comment['rpid']
if (comment.get("rcount", 0) > 0): if (comment.get("rcount", 0) > 0):
-                    {
-                        await self.get_video_all_level_two_comments(
-                            video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
-                    }
+                    {await self.get_video_all_level_two_comments(video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)}
if len(result) + len(comment_list) > max_count: if len(result) + len(comment_list) > max_count:
comment_list = comment_list[:max_count - len(result)] comment_list = comment_list[:max_count - len(result)]
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
@@ -306,14 +298,15 @@ class BilibiliClient(AbstractApiClient):
continue continue
return result return result
-    async def get_video_all_level_two_comments(self,
-                                               video_id: str,
-                                               level_one_comment_id: int,
-                                               order_mode: CommentOrderType,
-                                               ps: int = 10,
-                                               crawl_interval: float = 1.0,
-                                               callback: Optional[Callable] = None,
-                                               ) -> Dict:
+    async def get_video_all_level_two_comments(
+        self,
+        video_id: str,
+        level_one_comment_id: int,
+        order_mode: CommentOrderType,
+        ps: int = 10,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+    ) -> Dict:
""" """
get video all level two comments for a level one comment get video all level two comments for a level one comment
:param video_id: 视频 ID :param video_id: 视频 ID
@@ -327,8 +320,7 @@ class BilibiliClient(AbstractApiClient):
pn = 1 pn = 1
while True: while True:
result = await self.get_video_level_two_comments( result = await self.get_video_level_two_comments(video_id, level_one_comment_id, pn, ps, order_mode)
video_id, level_one_comment_id, pn, ps, order_mode)
comment_list: List[Dict] = result.get("replies", []) comment_list: List[Dict] = result.get("replies", [])
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(video_id, comment_list) await callback(video_id, comment_list)
@@ -338,13 +330,14 @@ class BilibiliClient(AbstractApiClient):
pn += 1 pn += 1
-    async def get_video_level_two_comments(self,
-                                           video_id: str,
-                                           level_one_comment_id: int,
-                                           pn: int,
-                                           ps: int,
-                                           order_mode: CommentOrderType,
-                                           ) -> Dict:
+    async def get_video_level_two_comments(
+        self,
+        video_id: str,
+        level_one_comment_id: int,
+        pn: int,
+        ps: int,
+        order_mode: CommentOrderType,
+    ) -> Dict:
"""get video level two comments """get video level two comments
:param video_id: 视频 ID :param video_id: 视频 ID
:param level_one_comment_id: 一级评论 ID :param level_one_comment_id: 一级评论 ID
@@ -393,11 +386,12 @@ class BilibiliClient(AbstractApiClient):
} }
return await self.get(uri, post_data) return await self.get(uri, post_data)
-    async def get_creator_fans(self,
-                               creator_id: int,
-                               pn: int,
-                               ps: int = 24,
-                               ) -> Dict:
+    async def get_creator_fans(
+        self,
+        creator_id: int,
+        pn: int,
+        ps: int = 24,
+    ) -> Dict:
""" """
get creator fans get creator fans
:param creator_id: 创作者 ID :param creator_id: 创作者 ID
@@ -411,15 +405,15 @@ class BilibiliClient(AbstractApiClient):
"pn": pn, "pn": pn,
"ps": ps, "ps": ps,
"gaia_source": "main_web", "gaia_source": "main_web",
} }
return await self.get(uri, post_data) return await self.get(uri, post_data)
-    async def get_creator_followings(self,
-                                     creator_id: int,
-                                     pn: int,
-                                     ps: int = 24,
-                                     ) -> Dict:
+    async def get_creator_followings(
+        self,
+        creator_id: int,
+        pn: int,
+        ps: int = 24,
+    ) -> Dict:
""" """
get creator followings get creator followings
:param creator_id: 创作者 ID :param creator_id: 创作者 ID
@@ -452,9 +446,13 @@ class BilibiliClient(AbstractApiClient):
return await self.get(uri, post_data) return await self.get(uri, post_data)
-    async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0,
-                                   callback: Optional[Callable] = None,
-                                   max_count: int = 100) -> List:
+    async def get_creator_all_fans(
+        self,
+        creator_info: Dict,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+        max_count: int = 100,
+    ) -> List:
""" """
get creator all fans get creator all fans
:param creator_info: :param creator_info:
@@ -482,9 +480,13 @@ class BilibiliClient(AbstractApiClient):
result.extend(fans_list) result.extend(fans_list)
return result return result
-    async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0,
-                                         callback: Optional[Callable] = None,
-                                         max_count: int = 100) -> List:
+    async def get_creator_all_followings(
+        self,
+        creator_info: Dict,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+        max_count: int = 100,
+    ) -> List:
""" """
get creator all followings get creator all followings
:param creator_info: :param creator_info:
@@ -512,9 +514,13 @@ class BilibiliClient(AbstractApiClient):
result.extend(followings_list) result.extend(followings_list)
return result return result
-    async def get_creator_all_dynamics(self, creator_info: Dict, crawl_interval: float = 1.0,
-                                       callback: Optional[Callable] = None,
-                                       max_count: int = 20) -> List:
+    async def get_creator_all_dynamics(
+        self,
+        creator_info: Dict,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+        max_count: int = 20,
+    ) -> List:
""" """
get creator all followings get creator all followings
:param creator_info: :param creator_info:
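For reference, a hedged usage sketch of the paginated comment helper reformatted above; the callback signature is assumed to mirror the level-two helper, which awaits callback(video_id, comment_list):

async def on_comments(video_id, comments):
    # Receives each fetched batch as it arrives.
    print(f"{video_id}: +{len(comments)} comments")

async def demo(bili_client):
    result = await bili_client.get_video_all_comments(
        video_id="170001",  # illustrative aid
        crawl_interval=1.0,
        is_fetch_sub_comments=False,
        callback=on_comments,
        max_count=10,
    )
    print(f"collected {len(result)} comments")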

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2023/12/2 18:44 # @Time : 2023/12/2 18:44
@@ -59,13 +58,9 @@ class BilibiliCrawler(AbstractCrawler):
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool( ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info( playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
ip_proxy_info
)
async with async_playwright() as playwright: async with async_playwright() as playwright:
# 根据配置选择启动模式 # 根据配置选择启动模式
@@ -81,9 +76,7 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器") utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器")
# Launch a browser context. # Launch a browser context.
chromium = playwright.chromium chromium = playwright.chromium
self.browser_context = await self.launch_browser( self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS)
chromium, None, self.user_agent, headless=config.HEADLESS
)
# stealth.min.js is a js script to prevent the website from detecting the crawler. # stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js") await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page() self.context_page = await self.browser_context.new_page()
@@ -100,9 +93,7 @@ class BilibiliCrawler(AbstractCrawler):
cookie_str=config.COOKIES, cookie_str=config.COOKIES,
) )
await login_obj.begin() await login_obj.begin()
await self.bili_client.update_cookies( await self.bili_client.update_cookies(browser_context=self.browser_context)
browser_context=self.browser_context
)
crawler_type_var.set(config.CRAWLER_TYPE) crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search": if config.CRAWLER_TYPE == "search":
@@ -136,7 +127,8 @@ class BilibiliCrawler(AbstractCrawler):
@staticmethod @staticmethod
async def get_pubtime_datetime( async def get_pubtime_datetime(
start: str = config.START_DAY, end: str = config.END_DAY start: str = config.START_DAY,
end: str = config.END_DAY,
) -> Tuple[str, str]: ) -> Tuple[str, str]:
""" """
获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s 获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s
@@ -158,17 +150,11 @@ class BilibiliCrawler(AbstractCrawler):
start_day: datetime = datetime.strptime(start, "%Y-%m-%d") start_day: datetime = datetime.strptime(start, "%Y-%m-%d")
end_day: datetime = datetime.strptime(end, "%Y-%m-%d") end_day: datetime = datetime.strptime(end, "%Y-%m-%d")
if start_day > end_day: if start_day > end_day:
raise ValueError( raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end")
"Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end"
)
        elif start_day == end_day:  # 搜索同一天的内容
-            end_day = (
-                start_day + timedelta(days=1) - timedelta(seconds=1)
-            )  # 则将 end_day 设置为 start_day + 1 day - 1 second
+            end_day = (start_day + timedelta(days=1) - timedelta(seconds=1))  # 则将 end_day 设置为 start_day + 1 day - 1 second
        else:  # 搜索 start 至 end
-            end_day = (
-                end_day + timedelta(days=1) - timedelta(seconds=1)
-            )  # 则将 end_day 设置为 end_day + 1 day - 1 second
+            end_day = (end_day + timedelta(days=1) - timedelta(seconds=1))  # 则将 end_day 设置为 end_day + 1 day - 1 second
# 将其重新转换为时间戳 # 将其重新转换为时间戳
return str(int(start_day.timestamp())), str(int(end_day.timestamp())) return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
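To make the end-of-day adjustment above concrete: when a single day is searched, the end bound becomes 23:59:59 of that day before both bounds are converted to epoch seconds. The same arithmetic in isolation:

from datetime import datetime, timedelta

start_day = datetime.strptime("2024-01-01", "%Y-%m-%d")
end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # 2024-01-01 23:59:59
pubtime_begin_s = str(int(start_day.timestamp()))
pubtime_end_s = str(int(end_day.timestamp()))
print(pubtime_begin_s, pubtime_end_s)  # epoch-second strings passed to search_video_by_keyword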
@@ -177,32 +163,22 @@ class BilibiliCrawler(AbstractCrawler):
search bilibili video with keywords in normal mode search bilibili video with keywords in normal mode
:return: :return:
""" """
utils.logger.info( utils.logger.info("[BilibiliCrawler.search_by_keywords] Begin search bilibli keywords")
"[BilibiliCrawler.search_by_keywords] Begin search bilibli keywords"
)
bili_limit_count = 20 # bilibili limit page fixed value bili_limit_count = 20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
start_page = config.START_PAGE # start page number start_page = config.START_PAGE # start page number
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword) source_keyword_var.set(keyword)
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Current search keyword: {keyword}")
f"[BilibiliCrawler.search_by_keywords] Current search keyword: {keyword}"
)
page = 1 page = 1
while ( while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
page - start_page + 1
) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page: if page < start_page:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Skip page: {page}")
f"[BilibiliCrawler.search_by_keywords] Skip page: {page}"
)
page += 1 page += 1
continue continue
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search_by_keywords] search bilibili keyword: {keyword}, page: {page}")
f"[BilibiliCrawler.search_by_keywords] search bilibili keyword: {keyword}, page: {page}"
)
video_id_list: List[str] = [] video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword( videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword, keyword=keyword,
@@ -215,24 +191,15 @@ class BilibiliCrawler(AbstractCrawler):
video_list: List[Dict] = videos_res.get("result") video_list: List[Dict] = videos_res.get("result")
if not video_list: if not video_list:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search_by_keywords] No more videos for '{keyword}', moving to next keyword.")
f"[BilibiliCrawler.search_by_keywords] No more videos for '{keyword}', moving to next keyword."
)
break break
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [] task_list = []
try: try:
task_list = [ task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
self.get_video_info_task(
aid=video_item.get("aid"), bvid="", semaphore=semaphore
)
for video_item in video_list
]
except Exception as e: except Exception as e:
utils.logger.warning( utils.logger.warning(f"[BilibiliCrawler.search_by_keywords] error in the task list. The video for this page will not be included. {e}")
f"[BilibiliCrawler.search_by_keywords] error in the task list. The video for this page will not be included. {e}"
)
video_items = await asyncio.gather(*task_list) video_items = await asyncio.gather(*task_list)
for video_item in video_items: for video_item in video_items:
if video_item: if video_item:
@@ -248,74 +215,40 @@ class BilibiliCrawler(AbstractCrawler):
Search bilibili video with keywords in a given time range. Search bilibili video with keywords in a given time range.
:param daily_limit: if True, strictly limit the number of notes per day and total. :param daily_limit: if True, strictly limit the number of notes per day and total.
""" """
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Begin search with daily_limit={daily_limit}")
f"[BilibiliCrawler.search_by_keywords_in_time_range] Begin search with daily_limit={daily_limit}"
)
bili_limit_count = 20 bili_limit_count = 20
start_page = config.START_PAGE start_page = config.START_PAGE
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword) source_keyword_var.set(keyword)
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}")
f"[BilibiliCrawler.search_by_keywords_in_time_range] Current search keyword: {keyword}"
)
total_notes_crawled_for_keyword = 0 total_notes_crawled_for_keyword = 0
-            for day in pd.date_range(
-                start=config.START_DAY, end=config.END_DAY, freq="D"
-            ):
-                if (
-                    daily_limit
-                    and total_notes_crawled_for_keyword
-                    >= config.CRAWLER_MAX_NOTES_COUNT
-                ):
-                    utils.logger.info(
-                        f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days."
-                    )
+            for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq="D"):
+                if (daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
+                    utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
                    break
-                if (
-                    not daily_limit
-                    and total_notes_crawled_for_keyword
-                    >= config.CRAWLER_MAX_NOTES_COUNT
-                ):
-                    utils.logger.info(
-                        f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days."
-                    )
+                if (not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
+                    utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
                    break
-                pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(
-                    start=day.strftime("%Y-%m-%d"), end=day.strftime("%Y-%m-%d")
-                )
+                pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime("%Y-%m-%d"), end=day.strftime("%Y-%m-%d"))
page = 1 page = 1
notes_count_this_day = 0 notes_count_this_day = 0
while True: while True:
if notes_count_this_day >= config.MAX_NOTES_PER_DAY: if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
f"[BilibiliCrawler.search] Reached MAX_NOTES_PER_DAY limit for {day.ctime()}."
)
break break
-                    if (
-                        daily_limit
-                        and total_notes_crawled_for_keyword
-                        >= config.CRAWLER_MAX_NOTES_COUNT
-                    ):
-                        utils.logger.info(
-                            f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'."
-                        )
+                    if (daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
+                        utils.logger.info(f"[BilibiliCrawler.search] Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
                        break
-                    if (
-                        not daily_limit
-                        and total_notes_crawled_for_keyword
-                        >= config.CRAWLER_MAX_NOTES_COUNT
-                    ):
+                    if (not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
                        break
try: try:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}"
)
video_id_list: List[str] = [] video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword( videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword, keyword=keyword,
@@ -328,33 +261,18 @@ class BilibiliCrawler(AbstractCrawler):
video_list: List[Dict] = videos_res.get("result") video_list: List[Dict] = videos_res.get("result")
if not video_list: if not video_list:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.search] No more videos for '{keyword}' on {day.ctime()}, moving to next day.")
f"[BilibiliCrawler.search] No more videos for '{keyword}' on {day.ctime()}, moving to next day."
)
break break
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
self.get_video_info_task(
aid=video_item.get("aid"), bvid="", semaphore=semaphore
)
for video_item in video_list
]
video_items = await asyncio.gather(*task_list) video_items = await asyncio.gather(*task_list)
for video_item in video_items: for video_item in video_items:
if video_item: if video_item:
-                                if (
-                                    daily_limit
-                                    and total_notes_crawled_for_keyword
-                                    >= config.CRAWLER_MAX_NOTES_COUNT
-                                ):
+                                if (daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
                                    break
-                                if (
-                                    not daily_limit
-                                    and total_notes_crawled_for_keyword
-                                    >= config.CRAWLER_MAX_NOTES_COUNT
-                                ):
+                                if (not daily_limit and total_notes_crawled_for_keyword >= config.CRAWLER_MAX_NOTES_COUNT):
                                    break
if notes_count_this_day >= config.MAX_NOTES_PER_DAY: if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
break break
@@ -369,9 +287,7 @@ class BilibiliCrawler(AbstractCrawler):
await self.batch_get_video_comments(video_id_list) await self.batch_get_video_comments(video_id_list)
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.search] Error searching on {day.ctime()}: {e}")
f"[BilibiliCrawler.search] Error searching on {day.ctime()}: {e}"
)
break break
async def batch_get_video_comments(self, video_id_list: List[str]): async def batch_get_video_comments(self, video_id_list: List[str]):
@@ -381,20 +297,14 @@ class BilibiliCrawler(AbstractCrawler):
:return: :return:
""" """
if not config.ENABLE_GET_COMMENTS: if not config.ENABLE_GET_COMMENTS:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled"
)
return return
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}"
)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = [] task_list: List[Task] = []
for video_id in video_id_list: for video_id in video_id_list:
task = asyncio.create_task( task = asyncio.create_task(self.get_comments(video_id, semaphore), name=video_id)
self.get_comments(video_id, semaphore), name=video_id
)
task_list.append(task) task_list.append(task)
await asyncio.gather(*task_list) await asyncio.gather(*task_list)
@@ -407,9 +317,7 @@ class BilibiliCrawler(AbstractCrawler):
""" """
async with semaphore: async with semaphore:
try: try:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ..."
)
await asyncio.sleep(random.uniform(0.5, 1.5)) await asyncio.sleep(random.uniform(0.5, 1.5))
await self.bili_client.get_video_all_comments( await self.bili_client.get_video_all_comments(
video_id=video_id, video_id=video_id,
@@ -420,13 +328,9 @@ class BilibiliCrawler(AbstractCrawler):
) )
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}"
)
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}"
)
# Propagate the exception to be caught by the main loop # Propagate the exception to be caught by the main loop
raise raise
@@ -452,10 +356,7 @@ class BilibiliCrawler(AbstractCrawler):
:return: :return:
""" """
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore)
for video_id in bvids_list
]
video_details = await asyncio.gather(*task_list) video_details = await asyncio.gather(*task_list)
video_aids_list = [] video_aids_list = []
for video_detail in video_details: for video_detail in video_details:
@@ -469,9 +370,7 @@ class BilibiliCrawler(AbstractCrawler):
await self.get_bilibili_video(video_detail, semaphore) await self.get_bilibili_video(video_detail, semaphore)
await self.batch_get_video_comments(video_aids_list) await self.batch_get_video_comments(video_aids_list)
async def get_video_info_task( async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
self, aid: int, bvid: str, semaphore: asyncio.Semaphore
) -> Optional[Dict]:
""" """
Get video detail task Get video detail task
:param aid: :param aid:
@@ -484,19 +383,13 @@ class BilibiliCrawler(AbstractCrawler):
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid) result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
return result return result
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}"
)
return None return None
except KeyError as ex: except KeyError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}"
)
return None return None
async def get_video_play_url_task( async def get_video_play_url_task(self, aid: int, cid: int, semaphore: asyncio.Semaphore) -> Union[Dict, None]:
self, aid: int, cid: int, semaphore: asyncio.Semaphore
) -> Union[Dict, None]:
""" """
Get video play url Get video play url
:param aid: :param aid:
@@ -509,30 +402,20 @@ class BilibiliCrawler(AbstractCrawler):
result = await self.bili_client.get_video_play_url(aid=aid, cid=cid) result = await self.bili_client.get_video_play_url(aid=aid, cid=cid)
return result return result
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_video_play_url_task] Get video play url error: {ex}")
f"[BilibiliCrawler.get_video_play_url_task] Get video play url error: {ex}"
)
return None return None
except KeyError as ex: except KeyError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_video_play_url_task] have not fund play url from :{aid}|{cid}, err: {ex}")
f"[BilibiliCrawler.get_video_play_url_task] have not fund play url from :{aid}|{cid}, err: {ex}"
)
return None return None
async def create_bilibili_client( async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
self, httpx_proxy: Optional[str]
) -> BilibiliClient:
""" """
create bilibili client create bilibili client
:param httpx_proxy: httpx proxy :param httpx_proxy: httpx proxy
:return: bilibili client :return: bilibili client
""" """
utils.logger.info( utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
"[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ..." cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
)
cookie_str, cookie_dict = utils.convert_cookies(
await self.browser_context.cookies()
)
bilibili_client_obj = BilibiliClient( bilibili_client_obj = BilibiliClient(
proxies=httpx_proxy, proxies=httpx_proxy,
headers={ headers={
@@ -562,30 +445,27 @@ class BilibiliCrawler(AbstractCrawler):
:param headless: headless mode :param headless: headless mode
:return: browser context :return: browser context
""" """
utils.logger.info( utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
"[BilibiliCrawler.launch_browser] Begin create browser context ..."
)
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
# feat issue #14 # feat issue #14
# we will save login state to avoid login every time # we will save login state to avoid login every time
user_data_dir = os.path.join( user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,
headless=headless, headless=headless,
proxy=playwright_proxy, # type: ignore proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080}, viewport={
"width": 1920,
"height": 1080
},
user_agent=user_agent, user_agent=user_agent,
) )
return browser_context return browser_context
else: else:
# type: ignore # type: ignore
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
browser_context = await browser.new_context( browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
)
return browser_context return browser_context
async def launch_browser_with_cdp( async def launch_browser_with_cdp(
@@ -614,14 +494,10 @@ class BilibiliCrawler(AbstractCrawler):
return browser_context return browser_context
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败回退到标准模式: {e}")
f"[BilibiliCrawler] CDP模式启动失败回退到标准模式: {e}"
)
# 回退到标准模式 # 回退到标准模式
chromium = playwright.chromium chromium = playwright.chromium
return await self.launch_browser( return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
chromium, playwright_proxy, user_agent, headless
)
async def close(self): async def close(self):
"""Close browser context""" """Close browser context"""
@@ -634,13 +510,9 @@ class BilibiliCrawler(AbstractCrawler):
await self.browser_context.close() await self.browser_context.close()
utils.logger.info("[BilibiliCrawler.close] Browser context closed ...") utils.logger.info("[BilibiliCrawler.close] Browser context closed ...")
except TargetClosedError: except TargetClosedError:
utils.logger.warning( utils.logger.warning("[BilibiliCrawler.close] Browser context was already closed.")
"[BilibiliCrawler.close] Browser context was already closed."
)
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.close] An error occurred during close: {e}")
f"[BilibiliCrawler.close] An error occurred during close: {e}"
)
async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore): async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
""" """
@@ -649,19 +521,15 @@ class BilibiliCrawler(AbstractCrawler):
:param semaphore: :param semaphore:
:return: :return:
""" """
-        if not config.ENABLE_GET_IMAGES:
-            utils.logger.info(
-                f"[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled"
-            )
+        if not config.ENABLE_GET_MEIDAS:
+            utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled")
            return
video_item_view: Dict = video_item.get("View") video_item_view: Dict = video_item.get("View")
aid = video_item_view.get("aid") aid = video_item_view.get("aid")
cid = video_item_view.get("cid") cid = video_item_view.get("cid")
result = await self.get_video_play_url_task(aid, cid, semaphore) result = await self.get_video_play_url_task(aid, cid, semaphore)
if result is None: if result is None:
utils.logger.info( utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video play url failed")
"[BilibiliCrawler.get_bilibili_video] get video play url failed"
)
return return
durl_list = result.get("durl") durl_list = result.get("durl")
max_size = -1 max_size = -1
@@ -672,9 +540,7 @@ class BilibiliCrawler(AbstractCrawler):
max_size = size max_size = size
video_url = durl.get("url") video_url = durl.get("url")
if video_url == "": if video_url == "":
utils.logger.info( utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video url failed")
"[BilibiliCrawler.get_bilibili_video] get video url failed"
)
return return
content = await self.bili_client.get_video_media(video_url) content = await self.bili_client.get_video_media(video_url)
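The loop above keeps the URL of the largest durl entry before downloading it via get_video_media; an equivalent condensed selection, shown only for clarity:

def pick_largest_durl(durl_list) -> str:
    # Same size-based choice as get_bilibili_video: prefer the biggest stream.
    best = max(durl_list or [], key=lambda d: d.get("size", -1), default=None)
    return best.get("url", "") if best else ""

print(pick_largest_durl([{"size": 10, "url": "a"}, {"size": 99, "url": "b"}]))  # -> "b"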
@@ -687,25 +553,17 @@ class BilibiliCrawler(AbstractCrawler):
""" """
creator_id_list: get details for creator from creator_id_list creator_id_list: get details for creator from creator_id_list
""" """
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator" utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
)
utils.logger.info(
f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}"
)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = [] task_list: List[Task] = []
try: try:
for creator_id in creator_id_list: for creator_id in creator_id_list:
task = asyncio.create_task( task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
self.get_creator_details(creator_id, semaphore), name=creator_id
)
task_list.append(task) task_list.append(task)
except Exception as e: except Exception as e:
utils.logger.warning( utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}"
)
await asyncio.gather(*task_list) await asyncio.gather(*task_list)
@@ -717,9 +575,7 @@ class BilibiliCrawler(AbstractCrawler):
:return: :return:
""" """
async with semaphore: async with semaphore:
creator_unhandled_info: Dict = await self.bili_client.get_creator_info( creator_unhandled_info: Dict = await self.bili_client.get_creator_info(creator_id)
creator_id
)
creator_info: Dict = { creator_info: Dict = {
"id": creator_id, "id": creator_id,
"name": creator_unhandled_info.get("name"), "name": creator_unhandled_info.get("name"),
@@ -740,9 +596,7 @@ class BilibiliCrawler(AbstractCrawler):
creator_id = creator_info["id"] creator_id = creator_info["id"]
async with semaphore: async with semaphore:
try: try:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ..."
)
await self.bili_client.get_creator_all_fans( await self.bili_client.get_creator_all_fans(
creator_info=creator_info, creator_info=creator_info,
crawl_interval=random.random(), crawl_interval=random.random(),
@@ -751,13 +605,9 @@ class BilibiliCrawler(AbstractCrawler):
) )
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {ex}")
f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {ex}"
)
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_fans] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_fans] may be been blocked, err:{e}"
)
async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore): async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
""" """
@@ -769,9 +619,7 @@ class BilibiliCrawler(AbstractCrawler):
creator_id = creator_info["id"] creator_id = creator_info["id"]
async with semaphore: async with semaphore:
try: try:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ..."
)
await self.bili_client.get_creator_all_followings( await self.bili_client.get_creator_all_followings(
creator_info=creator_info, creator_info=creator_info,
crawl_interval=random.random(), crawl_interval=random.random(),
@@ -780,13 +628,9 @@ class BilibiliCrawler(AbstractCrawler):
) )
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {ex}")
f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {ex}"
)
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}"
)
async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore): async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
""" """
@@ -798,9 +642,7 @@ class BilibiliCrawler(AbstractCrawler):
creator_id = creator_info["id"] creator_id = creator_info["id"]
async with semaphore: async with semaphore:
try: try:
utils.logger.info( utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ..."
)
await self.bili_client.get_creator_all_dynamics( await self.bili_client.get_creator_all_dynamics(
creator_info=creator_info, creator_info=creator_info,
crawl_interval=random.random(), crawl_interval=random.random(),
@@ -809,10 +651,6 @@ class BilibiliCrawler(AbstractCrawler):
) )
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {ex}")
f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {ex}"
)
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}")
f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}"
)

View File

@@ -8,14 +8,13 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio import asyncio
import copy import copy
import json import json
import urllib.parse import urllib.parse
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Union, Optional
-import requests
+import httpx
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
from base.base_crawler import AbstractApiClient from base.base_crawler import AbstractApiClient
@@ -27,15 +26,16 @@ from .field import *
from .help import * from .help import *
-class DOUYINClient(AbstractApiClient):
+class DouYinClient(AbstractApiClient):
    def __init__(
        self,
-        timeout=30,
+        timeout=30,  # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
        proxies=None,
        *,
        headers: Dict,
        playwright_page: Optional[Page],
-        cookie_dict: Dict
+        cookie_dict: Dict,
    ):
        self.proxies = proxies
        self.timeout = timeout
@@ -45,8 +45,11 @@ class DOUYINClient(AbstractApiClient):
self.cookie_dict = cookie_dict self.cookie_dict = cookie_dict
async def __process_req_params( async def __process_req_params(
self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None, self,
request_method="GET" uri: str,
params: Optional[Dict] = None,
headers: Optional[Dict] = None,
request_method="GET",
): ):
if not params: if not params:
@@ -93,10 +96,8 @@ class DOUYINClient(AbstractApiClient):
    async def request(self, method, url, **kwargs):
        response = None
-        if method == "GET":
-            response = requests.request(method, url, **kwargs)
-        elif method == "POST":
-            response = requests.request(method, url, **kwargs)
+        async with httpx.AsyncClient(proxies=self.proxies) as client:
+            response = await client.request(method, url, timeout=self.timeout, **kwargs)
        try:
            if response.text == "" or response.text == "blocked":
                utils.logger.error(f"request params incrr, response.text: {response.text}")
@@ -132,13 +133,13 @@ class DOUYINClient(AbstractApiClient):
self.cookie_dict = cookie_dict self.cookie_dict = cookie_dict
async def search_info_by_keyword( async def search_info_by_keyword(
self, self,
keyword: str, keyword: str,
offset: int = 0, offset: int = 0,
search_channel: SearchChannelType = SearchChannelType.GENERAL, search_channel: SearchChannelType = SearchChannelType.GENERAL,
sort_type: SearchSortType = SearchSortType.GENERAL, sort_type: SearchSortType = SearchSortType.GENERAL,
publish_time: PublishTimeType = PublishTimeType.UNLIMITED, publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
search_id: str = "" search_id: str = "",
): ):
""" """
DouYin Web Search API DouYin Web Search API
@@ -165,10 +166,7 @@ class DOUYINClient(AbstractApiClient):
'search_id': search_id, 'search_id': search_id,
} }
if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value: if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
query_params["filter_selected"] = json.dumps({ query_params["filter_selected"] = json.dumps({"sort_type": str(sort_type.value), "publish_time": str(publish_time.value)})
"sort_type": str(sort_type.value),
"publish_time": str(publish_time.value)
})
query_params["is_filter_search"] = 1 query_params["is_filter_search"] = 1
query_params["search_source"] = "tab_search" query_params["search_source"] = "tab_search"
referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general" referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
@@ -182,9 +180,7 @@ class DOUYINClient(AbstractApiClient):
:param aweme_id: :param aweme_id:
:return: :return:
""" """
params = { params = {"aweme_id": aweme_id}
"aweme_id": aweme_id
}
headers = copy.copy(self.headers) headers = copy.copy(self.headers)
del headers["Origin"] del headers["Origin"]
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers) res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
@@ -195,12 +191,7 @@ class DOUYINClient(AbstractApiClient):
""" """
uri = "/aweme/v1/web/comment/list/" uri = "/aweme/v1/web/comment/list/"
params = { params = {"aweme_id": aweme_id, "cursor": cursor, "count": 20, "item_type": 0}
"aweme_id": aweme_id,
"cursor": cursor,
"count": 20,
"item_type": 0
}
keywords = request_keyword_var.get() keywords = request_keyword_var.get()
referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general' referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
headers = copy.copy(self.headers) headers = copy.copy(self.headers)
@@ -226,12 +217,12 @@ class DOUYINClient(AbstractApiClient):
return await self.get(uri, params) return await self.get(uri, params)
async def get_aweme_all_comments( async def get_aweme_all_comments(
self, self,
aweme_id: str, aweme_id: str,
crawl_interval: float = 1.0, crawl_interval: float = 1.0,
is_fetch_sub_comments=False, is_fetch_sub_comments=False,
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
max_count: int = 10, max_count: int = 10,
): ):
""" """
获取帖子的所有评论,包括子评论 获取帖子的所有评论,包括子评论
@@ -315,9 +306,17 @@ class DOUYINClient(AbstractApiClient):
posts_has_more = aweme_post_res.get("has_more", 0) posts_has_more = aweme_post_res.get("has_more", 0)
max_cursor = aweme_post_res.get("max_cursor") max_cursor = aweme_post_res.get("max_cursor")
aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else [] aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else []
utils.logger.info( utils.logger.info(f"[DouYinCrawler.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
f"[DOUYINClient.get_all_user_aweme_posts] got sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
if callback: if callback:
await callback(aweme_list) await callback(aweme_list)
result.extend(aweme_list) result.extend(aweme_list)
return result return result
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
if not response.reason_phrase == "OK":
utils.logger.error(f"[DouYinCrawler.get_aweme_media] request {url} err, res:{response.text}")
return None
else:
return response.content
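A hedged usage sketch of the new get_aweme_media helper; the play-address lookup and file name are assumptions about the aweme payload, only the download method itself comes from the diff above:

async def save_aweme_video(dy_client, aweme_info: dict) -> None:
    # Typical aweme payloads expose candidate URLs under video.play_addr.url_list (assumed).
    url_list = aweme_info.get("video", {}).get("play_addr", {}).get("url_list", [])
    if not url_list:
        return
    content = await dy_client.get_aweme_media(url_list[0])  # returns bytes, or None on failure
    if content is not None:
        with open(f"{aweme_info.get('aweme_id', 'aweme')}.mp4", "wb") as f:
            f.write(content)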

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio import asyncio
import os import os
import random import random
@@ -31,7 +30,7 @@ from tools import utils
from tools.cdp_browser import CDPBrowserManager from tools.cdp_browser import CDPBrowserManager
from var import crawler_type_var, source_keyword_var from var import crawler_type_var, source_keyword_var
-from .client import DOUYINClient
+from .client import DouYinClient
from .exception import DataFetchError from .exception import DataFetchError
from .field import PublishTimeType from .field import PublishTimeType
from .login import DouYinLogin from .login import DouYinLogin
@@ -39,7 +38,7 @@ from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler): class DouYinCrawler(AbstractCrawler):
context_page: Page context_page: Page
-    dy_client: DOUYINClient
+    dy_client: DouYinClient
browser_context: BrowserContext browser_context: BrowserContext
cdp_manager: Optional[CDPBrowserManager] cdp_manager: Optional[CDPBrowserManager]
@@ -50,13 +49,9 @@ class DouYinCrawler(AbstractCrawler):
async def start(self) -> None: async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool( ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info( playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
ip_proxy_info
)
async with async_playwright() as playwright: async with async_playwright() as playwright:
# 根据配置选择启动模式 # 根据配置选择启动模式
@@ -93,9 +88,7 @@ class DouYinCrawler(AbstractCrawler):
cookie_str=config.COOKIES, cookie_str=config.COOKIES,
) )
await login_obj.begin() await login_obj.begin()
await self.dy_client.update_cookies( await self.dy_client.update_cookies(browser_context=self.browser_context)
browser_context=self.browser_context
)
crawler_type_var.set(config.CRAWLER_TYPE) crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
@@ -121,17 +114,13 @@ class DouYinCrawler(AbstractCrawler):
aweme_list: List[str] = [] aweme_list: List[str] = []
page = 0 page = 0
dy_search_id = "" dy_search_id = ""
while ( while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
page - start_page + 1
) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page: if page < start_page:
utils.logger.info(f"[DouYinCrawler.search] Skip {page}") utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
page += 1 page += 1
continue continue
try: try:
utils.logger.info( utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}")
f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page}"
)
posts_res = await self.dy_client.search_info_by_keyword( posts_res = await self.dy_client.search_info_by_keyword(
keyword=keyword, keyword=keyword,
offset=page * dy_limit_count - dy_limit_count, offset=page * dy_limit_count - dy_limit_count,
@@ -139,67 +128,49 @@ class DouYinCrawler(AbstractCrawler):
search_id=dy_search_id, search_id=dy_search_id,
) )
if posts_res.get("data") is None or posts_res.get("data") == []: if posts_res.get("data") is None or posts_res.get("data") == []:
utils.logger.info( utils.logger.info(f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`")
f"[DouYinCrawler.search] search douyin keyword: {keyword}, page: {page} is empty,{posts_res.get('data')}`"
)
break break
except DataFetchError: except DataFetchError:
utils.logger.error( utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
f"[DouYinCrawler.search] search douyin keyword: {keyword} failed"
)
break break
page += 1 page += 1
if "data" not in posts_res: if "data" not in posts_res:
utils.logger.error( utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed账号也许被风控了。")
f"[DouYinCrawler.search] search douyin keyword: {keyword} failed账号也许被风控了。"
)
break break
dy_search_id = posts_res.get("extra", {}).get("logid", "") dy_search_id = posts_res.get("extra", {}).get("logid", "")
for post_item in posts_res.get("data"): for post_item in posts_res.get("data"):
try: try:
aweme_info: Dict = ( aweme_info: Dict = (post_item.get("aweme_info") or post_item.get("aweme_mix_info", {}).get("mix_items")[0])
post_item.get("aweme_info")
or post_item.get("aweme_mix_info", {}).get("mix_items")[0]
)
except TypeError: except TypeError:
continue continue
aweme_list.append(aweme_info.get("aweme_id", "")) aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin_store.update_douyin_aweme(aweme_item=aweme_info) await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
utils.logger.info( await self.get_aweme_media(aweme_item=aweme_info)
f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}" utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
)
await self.batch_get_note_comments(aweme_list) await self.batch_get_note_comments(aweme_list)
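For reference, a minimal sketch of the paging arithmetic driving the search loop above; the page size and the two config values are illustrative assumptions, and no requests are made:

# Sketch: the offset/stop logic of DouYinCrawler.search, without any network calls.
DY_LIMIT_COUNT = 10           # assumed page size (dy_limit_count) for the sketch
START_PAGE = 1                # illustrative stand-in for config.START_PAGE
CRAWLER_MAX_NOTES_COUNT = 40  # illustrative stand-in for config.CRAWLER_MAX_NOTES_COUNT

page = 0
while (page - START_PAGE + 1) * DY_LIMIT_COUNT <= CRAWLER_MAX_NOTES_COUNT:
    if page < START_PAGE:
        page += 1             # pages before START_PAGE are skipped, not fetched
        continue
    offset = page * DY_LIMIT_COUNT - DY_LIMIT_COUNT
    print(f"request page={page}, offset={offset}")
    page += 1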
async def get_specified_awemes(self): async def get_specified_awemes(self):
"""Get the information and comments of the specified post""" """Get the information and comments of the specified post"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ task_list = [self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST]
self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore)
for aweme_id in config.DY_SPECIFIED_ID_LIST
]
aweme_details = await asyncio.gather(*task_list) aweme_details = await asyncio.gather(*task_list)
for aweme_detail in aweme_details: for aweme_detail in aweme_details:
if aweme_detail is not None: if aweme_detail is not None:
await douyin_store.update_douyin_aweme(aweme_detail) await douyin_store.update_douyin_aweme(aweme_item=aweme_detail)
await self.get_aweme_media(aweme_item=aweme_detail)
await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST) await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
async def get_aweme_detail( async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
self, aweme_id: str, semaphore: asyncio.Semaphore
) -> Any:
"""Get note detail""" """Get note detail"""
async with semaphore: async with semaphore:
try: try:
return await self.dy_client.get_video_by_id(aweme_id) return await self.dy_client.get_video_by_id(aweme_id)
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}"
)
return None return None
except KeyError as ex: except KeyError as ex:
utils.logger.error( utils.logger.error(f"[DouYinCrawler.get_aweme_detail] could not find note detail aweme_id:{aweme_id}, err: {ex}")
f"[DouYinCrawler.get_aweme_detail] could not find note detail aweme_id:{aweme_id}, err: {ex}"
)
return None return None
async def batch_get_note_comments(self, aweme_list: List[str]) -> None: async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
@@ -207,17 +178,13 @@ class DouYinCrawler(AbstractCrawler):
Batch get note comments Batch get note comments
""" """
if not config.ENABLE_GET_COMMENTS: if not config.ENABLE_GET_COMMENTS:
utils.logger.info( utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled"
)
return return
task_list: List[Task] = [] task_list: List[Task] = []
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list: for aweme_id in aweme_list:
task = asyncio.create_task( task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
self.get_comments(aweme_id, semaphore), name=aweme_id
)
task_list.append(task) task_list.append(task)
if len(task_list) > 0: if len(task_list) > 0:
await asyncio.wait(task_list) await asyncio.wait(task_list)
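The fan-out above is the usual bounded-concurrency pattern: one task per aweme id, all gated by a semaphore sized from MAX_CONCURRENCY_NUM. A self-contained sketch, with a dummy coroutine standing in for get_comments:

import asyncio

MAX_CONCURRENCY_NUM = 1  # assumed, mirrors config.MAX_CONCURRENCY_NUM

async def fetch_comments(aweme_id: str, semaphore: asyncio.Semaphore) -> None:
    # Dummy stand-in for DouYinCrawler.get_comments: the semaphore caps how
    # many of these coroutines run at the same time.
    async with semaphore:
        await asyncio.sleep(0.1)
        print(f"fetched comments for {aweme_id}")

async def main() -> None:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    tasks = [
        asyncio.create_task(fetch_comments(aweme_id, semaphore), name=aweme_id)
        for aweme_id in ["111", "222", "333"]
    ]
    if tasks:
        await asyncio.wait(tasks)

asyncio.run(main())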
@@ -233,30 +200,22 @@ class DouYinCrawler(AbstractCrawler):
callback=douyin_store.batch_update_dy_aweme_comments, callback=douyin_store.batch_update_dy_aweme_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
) )
utils.logger.info( utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ..."
)
except DataFetchError as e: except DataFetchError as e:
utils.logger.error( utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}"
)
async def get_creators_and_videos(self) -> None: async def get_creators_and_videos(self) -> None:
""" """
Get the information and videos of the specified creator Get the information and videos of the specified creator
""" """
utils.logger.info( utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
"[DouYinCrawler.get_creators_and_videos] Begin get douyin creators"
)
for user_id in config.DY_CREATOR_ID_LIST: for user_id in config.DY_CREATOR_ID_LIST:
creator_info: Dict = await self.dy_client.get_user_info(user_id) creator_info: Dict = await self.dy_client.get_user_info(user_id)
if creator_info: if creator_info:
await douyin_store.save_creator(user_id, creator=creator_info) await douyin_store.save_creator(user_id, creator=creator_info)
# Get all video information of the creator # Get all video information of the creator
all_video_list = await self.dy_client.get_all_user_aweme_posts( all_video_list = await self.dy_client.get_all_user_aweme_posts(sec_user_id=user_id, callback=self.fetch_creator_video_detail)
sec_user_id=user_id, callback=self.fetch_creator_video_detail
)
video_ids = [video_item.get("aweme_id") for video_item in all_video_list] video_ids = [video_item.get("aweme_id") for video_item in all_video_list]
await self.batch_get_note_comments(video_ids) await self.batch_get_note_comments(video_ids)
@@ -266,25 +225,21 @@ class DouYinCrawler(AbstractCrawler):
Concurrently obtain the specified post list and save the data Concurrently obtain the specified post list and save the data
""" """
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ task_list = [self.get_aweme_detail(post_item.get("aweme_id"), semaphore) for post_item in video_list]
self.get_aweme_detail(post_item.get("aweme_id"), semaphore)
for post_item in video_list
]
note_details = await asyncio.gather(*task_list) note_details = await asyncio.gather(*task_list)
for aweme_item in note_details: for aweme_item in note_details:
if aweme_item is not None: if aweme_item is not None:
await douyin_store.update_douyin_aweme(aweme_item) await douyin_store.update_douyin_aweme(aweme_item=aweme_item)
await self.get_aweme_media(aweme_item=aweme_item)
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient: async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient:
"""Create douyin client""" """Create douyin client"""
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
douyin_client = DOUYINClient( douyin_client = DouYinClient(
proxies=httpx_proxy, proxies=httpx_proxy,
headers={ headers={
"User-Agent": await self.context_page.evaluate( "User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
"() => navigator.userAgent"
),
"Cookie": cookie_str, "Cookie": cookie_str,
"Host": "www.douyin.com", "Host": "www.douyin.com",
"Origin": "https://www.douyin.com/", "Origin": "https://www.douyin.com/",
@@ -305,23 +260,22 @@ class DouYinCrawler(AbstractCrawler):
) -> BrowserContext: ) -> BrowserContext:
"""Launch browser and create browser context""" """Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join( user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,
headless=headless, headless=headless,
proxy=playwright_proxy, # type: ignore proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080}, viewport={
"width": 1920,
"height": 1080
},
user_agent=user_agent, user_agent=user_agent,
) # type: ignore ) # type: ignore
return browser_context return browser_context
else: else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context( browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
)
return browser_context return browser_context
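A standalone sketch of the two launch paths above: a persistent context (cookies and local storage survive between runs, so login state is kept) versus a throwaway context. The user-data directory and target URL are illustrative only:

import asyncio
import os

from playwright.async_api import async_playwright


async def demo(save_login_state: bool = True) -> None:
    async with async_playwright() as p:
        if save_login_state:
            # A persistent context keeps cookies/localStorage on disk, so the
            # next run starts out already logged in.
            user_data_dir = os.path.join(os.getcwd(), "browser_data", "dy_demo")  # illustrative path
            context = await p.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                headless=True,
                viewport={"width": 1920, "height": 1080},
            )
        else:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(viewport={"width": 1920, "height": 1080})
        page = await context.new_page()
        await page.goto("https://example.com")
        await context.close()


asyncio.run(demo())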
async def launch_browser_with_cdp( async def launch_browser_with_cdp(
@@ -356,9 +310,7 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.error(f"[DouYinCrawler] CDP模式启动失败回退到标准模式: {e}") utils.logger.error(f"[DouYinCrawler] CDP模式启动失败回退到标准模式: {e}")
# 回退到标准模式 # 回退到标准模式
chromium = playwright.chromium chromium = playwright.chromium
return await self.launch_browser( return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
chromium, playwright_proxy, user_agent, headless
)
async def close(self) -> None: async def close(self) -> None:
"""Close browser context""" """Close browser context"""
@@ -369,3 +321,73 @@ class DouYinCrawler(AbstractCrawler):
else: else:
await self.browser_context.close() await self.browser_context.close()
utils.logger.info("[DouYinCrawler.close] Browser context closed ...") utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
async def get_aweme_media(self, aweme_item: Dict):
"""
Fetch the media of a Douyin work: automatically determine whether it is a short video or an image post, then download it
Args:
aweme_item (Dict): Douyin aweme (work) detail
"""
if not config.ENABLE_GET_MEIDAS:
utils.logger.info(f"[DouYinCrawler.get_aweme_media] Crawling image mode is not enabled")
return
# image URLs of the post; empty list when the work is a short video
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
# video URL; always present, but for image posts the file it points to is actually the background audio
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
# TODO: Douyin does not serve audio and video as separate streams, so audio could be split from the video itself; not extracted for now
if note_download_url:
await self.get_aweme_images(aweme_item)
else:
await self.get_aweme_video(aweme_item)
async def get_aweme_images(self, aweme_item: Dict):
"""
Get the images of an aweme post. Prefer calling get_aweme_media instead.
Args:
aweme_item (Dict): Douyin aweme (work) detail
"""
if not config.ENABLE_GET_MEIDAS:
return
aweme_id = aweme_item.get("aweme_id")
# image URLs of the post; empty list when the work is a short video
note_download_url: List[str] = douyin_store._extract_note_image_list(aweme_item)
if not note_download_url:
return
picNum = 0
for url in note_download_url:
if not url:
continue
content = await self.dy_client.get_aweme_media(url)
if content is None:
continue
extension_file_name = f"{picNum}.jpeg"
picNum += 1
await douyin_store.update_dy_aweme_image(aweme_id, content, extension_file_name)
async def get_aweme_video(self, aweme_item: Dict):
"""
Get the video of an aweme. Prefer calling get_aweme_media instead.
Args:
aweme_item (Dict): Douyin aweme (work) detail
"""
if not config.ENABLE_GET_MEIDAS:
return
aweme_id = aweme_item.get("aweme_id")
# video URL; always present, but for image posts the file it points to is actually the background audio
video_download_url: str = douyin_store._extract_video_download_url(aweme_item)
if not video_download_url:
return
videoNum = 0
content = await self.dy_client.get_aweme_media(video_download_url)
if content is None:
return
extension_file_name = f"{videoNum}.mp4"
videoNum += 1
await douyin_store.update_dy_aweme_video(aweme_id, content, extension_file_name)
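Put together, get_aweme_media reduces to one decision: if the store helpers return any image URLs the work is an image post, otherwise the always-present video URL is downloaded. A minimal sketch of that decision with hypothetical stand-ins for the two private extractor helpers (the real field layout may differ):

from typing import Dict, List


def extract_note_image_list(aweme_item: Dict) -> List[str]:
    # Hypothetical stand-in for douyin_store._extract_note_image_list:
    # image posts carry a list of image URLs, short videos do not.
    return [img.get("url", "") for img in aweme_item.get("images") or []]


def extract_video_download_url(aweme_item: Dict) -> str:
    # Hypothetical stand-in for douyin_store._extract_video_download_url.
    return aweme_item.get("video", {}).get("play_addr", "")


def classify(aweme_item: Dict) -> str:
    image_urls = extract_note_image_list(aweme_item)
    return "image_post" if image_urls else "short_video"


print(classify({"images": [{"url": "https://p3.douyinpic.com/a.jpeg"}]}))  # image_post
print(classify({"video": {"play_addr": "https://v.douyin.com/a.mp4"}}))    # short_video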

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:40 # @Time : 2023/12/23 15:40
@@ -33,14 +32,15 @@ from .field import SearchType
class WeiboClient: class WeiboClient:
def __init__( def __init__(
self, self,
timeout=10, timeout=30, # with media crawling enabled, Weibo images need a longer timeout
proxies=None, proxies=None,
*, *,
headers: Dict[str, str], headers: Dict[str, str],
playwright_page: Page, playwright_page: Page,
cookie_dict: Dict[str, str], cookie_dict: Dict[str, str],
): ):
self.proxies = proxies self.proxies = proxies
self.timeout = timeout self.timeout = timeout
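Every call below goes through a short-lived httpx.AsyncClient with timeout=self.timeout, which is why the default was raised to 30s for media downloads. A small sketch of that request shape, with an illustrative URL:

import asyncio

import httpx


async def fetch(url: str, timeout: float = 30.0) -> int:
    # Mirrors the request shape used by WeiboClient: a short-lived AsyncClient
    # and a per-request timeout (larger when media downloads are enabled).
    async with httpx.AsyncClient() as client:
        response = await client.request("GET", url, timeout=timeout)
        return response.status_code


print(asyncio.run(fetch("https://example.com")))  # illustrative URL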
@@ -53,10 +53,7 @@ class WeiboClient:
async def request(self, method, url, **kwargs) -> Union[Response, Dict]: async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
enable_return_response = kwargs.pop("return_response", False) enable_return_response = kwargs.pop("return_response", False)
async with httpx.AsyncClient(proxies=self.proxies) as client: async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request( response = await client.request(method, url, timeout=self.timeout, **kwargs)
method, url, timeout=self.timeout,
**kwargs
)
if enable_return_response: if enable_return_response:
return response return response
@@ -84,8 +81,7 @@ class WeiboClient:
async def post(self, uri: str, data: dict) -> Dict: async def post(self, uri: str, data: dict) -> Dict:
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=f"{self._host}{uri}", return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers)
data=json_str, headers=self.headers)
async def pong(self) -> bool: async def pong(self) -> bool:
"""get a note to check if login state is ok""" """get a note to check if login state is ok"""
@@ -109,10 +105,10 @@ class WeiboClient:
self.cookie_dict = cookie_dict self.cookie_dict = cookie_dict
async def get_note_by_keyword( async def get_note_by_keyword(
self, self,
keyword: str, keyword: str,
page: int = 1, page: int = 1,
search_type: SearchType = SearchType.DEFAULT search_type: SearchType = SearchType.DEFAULT,
) -> Dict: ) -> Dict:
""" """
search note by keyword search note by keyword
@@ -187,8 +183,11 @@ class WeiboClient:
return result return result
@staticmethod @staticmethod
async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict], async def get_comments_all_sub_comments(
callback: Optional[Callable] = None) -> List[Dict]: note_id: str,
comment_list: List[Dict],
callback: Optional[Callable] = None,
) -> List[Dict]:
""" """
获取评论的所有子评论 获取评论的所有子评论
Args: Args:
@@ -200,8 +199,7 @@ class WeiboClient:
""" """
if not config.ENABLE_GET_SUB_COMMENTS: if not config.ENABLE_GET_SUB_COMMENTS:
utils.logger.info( utils.logger.info(f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
return [] return []
res_sub_comments = [] res_sub_comments = []
@@ -220,9 +218,7 @@ class WeiboClient:
""" """
url = f"{self._host}/detail/{note_id}" url = f"{self._host}/detail/{note_id}"
async with httpx.AsyncClient(proxies=self.proxies) as client: async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request( response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
"GET", url, timeout=self.timeout, headers=self.headers
)
if response.status_code != 200: if response.status_code != 200:
raise DataFetchError(f"get weibo detail err: {response.text}") raise DataFetchError(f"get weibo detail err: {response.text}")
match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL) match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
@@ -230,9 +226,7 @@ class WeiboClient:
render_data_json = match.group(1) render_data_json = match.group(1)
render_data_dict = json.loads(render_data_json) render_data_dict = json.loads(render_data_json)
note_detail = render_data_dict[0].get("status") note_detail = render_data_dict[0].get("status")
note_item = { note_item = {"mblog": note_detail}
"mblog": note_detail
}
return note_item return note_item
else: else:
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值") utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
@@ -251,7 +245,8 @@ class WeiboClient:
image_url += sub_url[i] + "/" image_url += sub_url[i] + "/"
# 微博图床对外存在防盗链,所以需要代理访问 # 微博图床对外存在防盗链,所以需要代理访问
# 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下 # 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下
final_uri = (f"{self._image_agent_host}" f"{image_url}") final_uri = (f"{self._image_agent_host}"
f"{image_url}")
async with httpx.AsyncClient(proxies=self.proxies) as client: async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request("GET", final_uri, timeout=self.timeout) response = await client.request("GET", final_uri, timeout=self.timeout)
if not response.reason_phrase == "OK": if not response.reason_phrase == "OK":
@@ -260,8 +255,6 @@ class WeiboClient:
else: else:
return response.content return response.content
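Because the Weibo image CDN rejects hotlinked requests, get_note_image rebuilds each image URL behind the image proxy host before downloading. A simplified sketch of that rewrite; the proxy prefix is an assumed example value for self._image_agent_host:

IMAGE_AGENT_HOST = "https://i1.wp.com/"  # assumed proxy prefix, in the spirit of WeiboClient._image_agent_host

def build_proxied_image_url(image_url: str) -> str:
    # Strip the scheme and re-prefix with the image proxy so the request is
    # not rejected by Weibo's anti-hotlink check.
    host_and_path = image_url.split("//", 1)[1]
    return f"{IMAGE_AGENT_HOST}{host_and_path}"

print(build_proxied_image_url("https://wx1.sinaimg.cn/large/abc123.jpg"))
# -> https://i1.wp.com/wx1.sinaimg.cn/large/abc123.jpg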
async def get_creator_container_info(self, creator_id: str) -> Dict: async def get_creator_container_info(self, creator_id: str) -> Dict:
""" """
获取用户的容器ID, 容器信息代表着真实请求的API路径 获取用户的容器ID, 容器信息代表着真实请求的API路径
@@ -278,10 +271,7 @@ class WeiboClient:
if not m_weibocn_params: if not m_weibocn_params:
raise DataFetchError("get containerid failed") raise DataFetchError("get containerid failed")
m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params)) m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params))
return { return {"fid_container_id": m_weibocn_params_dict.get("fid", [""])[0], "lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]}
"fid_container_id": m_weibocn_params_dict.get("fid", [""])[0],
"lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]
}
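The container ids above come from the URL-encoded M_WEIBOCN_PARAMS cookie: unquote it, parse it as a query string, and read fid/lfid. A sketch with a made-up cookie value:

from urllib.parse import parse_qs, unquote

# Made-up cookie value shaped like M_WEIBOCN_PARAMS.
m_weibocn_params = "fid%3D1076031234567890%26lfid%3D231093_-_selffollowed"

params = parse_qs(unquote(m_weibocn_params))
print({
    "fid_container_id": params.get("fid", [""])[0],
    "lfid_container_id": params.get("lfid", [""])[0],
})
# -> {'fid_container_id': '1076031234567890', 'lfid_container_id': '231093_-_selffollowed'}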
async def get_creator_info_by_id(self, creator_id: str) -> Dict: async def get_creator_info_by_id(self, creator_id: str) -> Dict:
""" """
@@ -316,7 +306,12 @@ class WeiboClient:
user_res.update(container_info) user_res.update(container_info)
return user_res return user_res
async def get_notes_by_creator(self, creator: str, container_id: str, since_id: str = "0", ) -> Dict: async def get_notes_by_creator(
self,
creator: str,
container_id: str,
since_id: str = "0",
) -> Dict:
""" """
获取博主的笔记 获取博主的笔记
Args: Args:
@@ -337,8 +332,13 @@ class WeiboClient:
} }
return await self.get(uri, params) return await self.get(uri, params)
async def get_all_notes_by_creator_id(self, creator_id: str, container_id: str, crawl_interval: float = 1.0, async def get_all_notes_by_creator_id(
callback: Optional[Callable] = None) -> List[Dict]: self,
creator_id: str,
container_id: str,
crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
) -> List[Dict]:
""" """
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
Args: Args:
@@ -357,19 +357,16 @@ class WeiboClient:
while notes_has_more: while notes_has_more:
notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id) notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
if not notes_res: if not notes_res:
utils.logger.error( utils.logger.error(f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by weibo, so they cannot access the data.")
f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by weibo, so they cannot access the data.")
break break
since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0") since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
if "cards" not in notes_res: if "cards" not in notes_res:
utils.logger.info( utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
break break
notes = notes_res["cards"] notes = notes_res["cards"]
utils.logger.info( utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}") notes = [note for note in notes if note.get("card_type") == 9]
notes = [note for note in notes if note.get("card_type") == 9]
if callback: if callback:
await callback(notes) await callback(notes)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
@@ -377,4 +374,3 @@ class WeiboClient:
crawler_total_count += 10 crawler_total_count += 10
notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
return result return result
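The creator pagination above keeps requesting with the returned since_id, keeps only cards with card_type == 9 (real posts), and stops once the accumulated page count passes cardlistInfo.total. A sketch of that filtering and stop logic over two canned responses:

from typing import Dict, List

# Canned responses shaped like the m.weibo.cn container API (two pages).
pages: List[Dict] = [
    {"cardlistInfo": {"since_id": "p2", "total": 15},
     "cards": [{"card_type": 9, "mblog": {"id": "1"}}, {"card_type": 11}]},
    {"cardlistInfo": {"since_id": "", "total": 15},
     "cards": [{"card_type": 9, "mblog": {"id": "2"}}, {"card_type": 9, "mblog": {"id": "3"}}]},
]

result: List[Dict] = []
crawler_total_count = 0
for notes_res in pages:                      # stands in for repeated get_notes_by_creator calls
    notes = [n for n in notes_res["cards"] if n.get("card_type") == 9]
    result.extend(notes)
    crawler_total_count += 10                # same per-page increment as the client
    if notes_res["cardlistInfo"]["total"] <= crawler_total_count:
        break                                # no more pages to fetch

print([n["mblog"]["id"] for n in result])    # ['1', '2', '3']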

View File

@@ -8,13 +8,11 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2023/12/23 15:41 # @Time : 2023/12/23 15:41
# @Desc : 微博爬虫主流程代码 # @Desc : 微博爬虫主流程代码
import asyncio import asyncio
import os import os
import random import random
@@ -60,13 +58,9 @@ class WeiboCrawler(AbstractCrawler):
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool( ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info( playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
ip_proxy_info
)
async with async_playwright() as playwright: async with async_playwright() as playwright:
# 根据配置选择启动模式 # 根据配置选择启动模式
@@ -82,9 +76,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器") utils.logger.info("[WeiboCrawler] 使用标准模式启动浏览器")
# Launch a browser context. # Launch a browser context.
chromium = playwright.chromium chromium = playwright.chromium
self.browser_context = await self.launch_browser( self.browser_context = await self.launch_browser(chromium, None, self.mobile_user_agent, headless=config.HEADLESS)
chromium, None, self.mobile_user_agent, headless=config.HEADLESS
)
# stealth.min.js is a js script to prevent the website from detecting the crawler. # stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js") await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page() self.context_page = await self.browser_context.new_page()
@@ -103,14 +95,10 @@ class WeiboCrawler(AbstractCrawler):
await login_obj.begin() await login_obj.begin()
# 登录成功后重定向到手机端的网站再更新手机端登录成功的cookie # 登录成功后重定向到手机端的网站再更新手机端登录成功的cookie
utils.logger.info( utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform")
"[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform"
)
await self.context_page.goto(self.mobile_index_url) await self.context_page.goto(self.mobile_index_url)
await asyncio.sleep(2) await asyncio.sleep(2)
await self.wb_client.update_cookies( await self.wb_client.update_cookies(browser_context=self.browser_context)
browser_context=self.browser_context
)
crawler_type_var.set(config.CRAWLER_TYPE) crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search": if config.CRAWLER_TYPE == "search":
@@ -147,30 +135,20 @@ class WeiboCrawler(AbstractCrawler):
elif config.WEIBO_SEARCH_TYPE == "video": elif config.WEIBO_SEARCH_TYPE == "video":
search_type = SearchType.VIDEO search_type = SearchType.VIDEO
else: else:
utils.logger.error( utils.logger.error(f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}")
f"[WeiboCrawler.search] Invalid WEIBO_SEARCH_TYPE: {config.WEIBO_SEARCH_TYPE}"
)
return return
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword) source_keyword_var.set(keyword)
utils.logger.info( utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
f"[WeiboCrawler.search] Current search keyword: {keyword}"
)
page = 1 page = 1
while ( while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
page - start_page + 1
) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page: if page < start_page:
utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}") utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
page += 1 page += 1
continue continue
utils.logger.info( utils.logger.info(f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}")
f"[WeiboCrawler.search] search weibo keyword: {keyword}, page: {page}" search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
)
search_res = await self.wb_client.get_note_by_keyword(
keyword=keyword, page=page, search_type=search_type
)
note_id_list: List[str] = [] note_id_list: List[str] = []
note_list = filter_search_result_card(search_res.get("cards")) note_list = filter_search_result_card(search_res.get("cards"))
for note_item in note_list: for note_item in note_list:
@@ -190,19 +168,14 @@ class WeiboCrawler(AbstractCrawler):
:return: :return:
""" """
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ task_list = [self.get_note_info_task(note_id=note_id, semaphore=semaphore) for note_id in config.WEIBO_SPECIFIED_ID_LIST]
self.get_note_info_task(note_id=note_id, semaphore=semaphore)
for note_id in config.WEIBO_SPECIFIED_ID_LIST
]
video_details = await asyncio.gather(*task_list) video_details = await asyncio.gather(*task_list)
for note_item in video_details: for note_item in video_details:
if note_item: if note_item:
await weibo_store.update_weibo_note(note_item) await weibo_store.update_weibo_note(note_item)
await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST) await self.batch_get_notes_comments(config.WEIBO_SPECIFIED_ID_LIST)
async def get_note_info_task( async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
self, note_id: str, semaphore: asyncio.Semaphore
) -> Optional[Dict]:
""" """
Get note detail task Get note detail task
:param note_id: :param note_id:
@@ -214,14 +187,10 @@ class WeiboCrawler(AbstractCrawler):
result = await self.wb_client.get_note_info_by_id(note_id) result = await self.wb_client.get_note_info_by_id(note_id)
return result return result
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}"
)
return None return None
except KeyError as ex: except KeyError as ex:
utils.logger.error( utils.logger.error(f"[WeiboCrawler.get_note_info_task] could not find note detail note_id:{note_id}, err: {ex}")
f"[WeiboCrawler.get_note_info_task] could not find note detail note_id:{note_id}, err: {ex}"
)
return None return None
async def batch_get_notes_comments(self, note_id_list: List[str]): async def batch_get_notes_comments(self, note_id_list: List[str]):
@@ -231,20 +200,14 @@ class WeiboCrawler(AbstractCrawler):
:return: :return:
""" """
if not config.ENABLE_GET_COMMENTS: if not config.ENABLE_GET_COMMENTS:
utils.logger.info( utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled"
)
return return
utils.logger.info( utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}"
)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = [] task_list: List[Task] = []
for note_id in note_id_list: for note_id in note_id_list:
task = asyncio.create_task( task = asyncio.create_task(self.get_note_comments(note_id, semaphore), name=note_id)
self.get_note_comments(note_id, semaphore), name=note_id
)
task_list.append(task) task_list.append(task)
await asyncio.gather(*task_list) await asyncio.gather(*task_list)
@@ -257,25 +220,17 @@ class WeiboCrawler(AbstractCrawler):
""" """
async with semaphore: async with semaphore:
try: try:
utils.logger.info( utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ..."
)
await self.wb_client.get_note_all_comments( await self.wb_client.get_note_all_comments(
note_id=note_id, note_id=note_id,
crawl_interval=random.randint( crawl_interval=random.randint(1, 3), # 微博对API的限流比较严重所以延时提高一些
1, 3
), # 微博对API的限流比较严重所以延时提高一些
callback=weibo_store.batch_update_weibo_note_comments, callback=weibo_store.batch_update_weibo_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
) )
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}"
)
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[WeiboCrawler.get_note_comments] may have been blocked, err:{e}")
f"[WeiboCrawler.get_note_comments] may have been blocked, err:{e}"
)
async def get_note_images(self, mblog: Dict): async def get_note_images(self, mblog: Dict):
""" """
@@ -283,10 +238,8 @@ class WeiboCrawler(AbstractCrawler):
:param mblog: :param mblog:
:return: :return:
""" """
if not config.ENABLE_GET_IMAGES: if not config.ENABLE_GET_MEIDAS:
utils.logger.info( utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled"
)
return return
pics: Dict = mblog.get("pics") pics: Dict = mblog.get("pics")
@@ -299,9 +252,7 @@ class WeiboCrawler(AbstractCrawler):
content = await self.wb_client.get_note_image(url) content = await self.wb_client.get_note_image(url)
if content != None: if content != None:
extension_file_name = url.split(".")[-1] extension_file_name = url.split(".")[-1]
await weibo_store.update_weibo_note_image( await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
pic["pid"], content, extension_file_name
)
async def get_creators_and_notes(self) -> None: async def get_creators_and_notes(self) -> None:
""" """
@@ -309,18 +260,12 @@ class WeiboCrawler(AbstractCrawler):
Returns: Returns:
""" """
utils.logger.info( utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
"[WeiboCrawler.get_creators_and_notes] Begin get weibo creators"
)
for user_id in config.WEIBO_CREATOR_ID_LIST: for user_id in config.WEIBO_CREATOR_ID_LIST:
createor_info_res: Dict = await self.wb_client.get_creator_info_by_id( createor_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
creator_id=user_id
)
if createor_info_res: if createor_info_res:
createor_info: Dict = createor_info_res.get("userInfo", {}) createor_info: Dict = createor_info_res.get("userInfo", {})
utils.logger.info( utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}")
f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}"
)
if not createor_info: if not createor_info:
raise DataFetchError("Get creator info error") raise DataFetchError("Get creator info error")
await weibo_store.save_creator(user_id, user_info=createor_info) await weibo_store.save_creator(user_id, user_info=createor_info)
@@ -333,26 +278,16 @@ class WeiboCrawler(AbstractCrawler):
callback=weibo_store.batch_update_weibo_notes, callback=weibo_store.batch_update_weibo_notes,
) )
note_ids = [ note_ids = [note_item.get("mblog", {}).get("id") for note_item in all_notes_list if note_item.get("mblog", {}).get("id")]
note_item.get("mblog", {}).get("id")
for note_item in all_notes_list
if note_item.get("mblog", {}).get("id")
]
await self.batch_get_notes_comments(note_ids) await self.batch_get_notes_comments(note_ids)
else: else:
utils.logger.error( utils.logger.error(f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}"
)
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient: async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
"""Create xhs client""" """Create xhs client"""
utils.logger.info( utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
"[WeiboCrawler.create_weibo_client] Begin create weibo API client ..." cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
)
cookie_str, cookie_dict = utils.convert_cookies(
await self.browser_context.cookies()
)
weibo_client_obj = WeiboClient( weibo_client_obj = WeiboClient(
proxies=httpx_proxy, proxies=httpx_proxy,
headers={ headers={
@@ -375,27 +310,24 @@ class WeiboCrawler(AbstractCrawler):
headless: bool = True, headless: bool = True,
) -> BrowserContext: ) -> BrowserContext:
"""Launch browser and create browser context""" """Launch browser and create browser context"""
utils.logger.info( utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
"[WeiboCrawler.launch_browser] Begin create browser context ..."
)
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join( user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,
headless=headless, headless=headless,
proxy=playwright_proxy, # type: ignore proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080}, viewport={
"width": 1920,
"height": 1080
},
user_agent=user_agent, user_agent=user_agent,
) )
return browser_context return browser_context
else: else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context( browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
)
return browser_context return browser_context
async def launch_browser_with_cdp( async def launch_browser_with_cdp(
@@ -427,9 +359,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.error(f"[WeiboCrawler] CDP模式启动失败回退到标准模式: {e}") utils.logger.error(f"[WeiboCrawler] CDP模式启动失败回退到标准模式: {e}")
# 回退到标准模式 # 回退到标准模式
chromium = playwright.chromium chromium = playwright.chromium
return await self.launch_browser( return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
chromium, playwright_proxy, user_agent, headless
)
async def close(self): async def close(self):
"""Close browser context""" """Close browser context"""

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio import asyncio
import json import json
import re import re
@@ -30,9 +29,10 @@ from .help import get_search_id, sign
class XiaoHongShuClient(AbstractApiClient): class XiaoHongShuClient(AbstractApiClient):
def __init__( def __init__(
self, self,
timeout=10, timeout=30, # with media crawling enabled, long xhs videos need a longer timeout
proxies=None, proxies=None,
*, *,
headers: Dict[str, str], headers: Dict[str, str],
@@ -61,9 +61,7 @@ class XiaoHongShuClient(AbstractApiClient):
Returns: Returns:
""" """
encrypt_params = await self.playwright_page.evaluate( encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data])
"([url, data]) => window._webmsxyw(url,data)", [url, data]
)
local_storage = await self.playwright_page.evaluate("() => window.localStorage") local_storage = await self.playwright_page.evaluate("() => window.localStorage")
signs = sign( signs = sign(
a1=self.cookie_dict.get("a1", ""), a1=self.cookie_dict.get("a1", ""),
@@ -130,9 +128,7 @@ class XiaoHongShuClient(AbstractApiClient):
if isinstance(params, dict): if isinstance(params, dict):
final_uri = f"{uri}?" f"{urlencode(params)}" final_uri = f"{uri}?" f"{urlencode(params)}"
headers = await self._pre_headers(final_uri) headers = await self._pre_headers(final_uri)
return await self.request( return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
method="GET", url=f"{self._host}{final_uri}", headers=headers
)
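GET requests are signed over the full relative URI (path plus encoded query), which is why final_uri is built before _pre_headers is called. A sketch of the URI-building step with a purely hypothetical stand-in for the signing helper (the real signature comes from window._webmsxyw in the logged-in page):

from typing import Dict, Optional
from urllib.parse import urlencode


def build_final_uri(uri: str, params: Optional[Dict] = None) -> str:
    # Mirrors XiaoHongShuClient.get: the query string becomes part of the
    # string that later gets signed into the request headers.
    if isinstance(params, dict):
        return f"{uri}?{urlencode(params)}"
    return uri


def fake_pre_headers(final_uri: str) -> Dict[str, str]:
    # Hypothetical stand-in for _pre_headers; header names and values here are
    # illustrative only.
    return {"X-s": f"signed({final_uri})", "X-t": "1700000000000"}


uri = build_final_uri("/api/sns/web/v1/feed", {"num": 30, "cursor": ""})
print(uri)                   # /api/sns/web/v1/feed?num=30&cursor=
print(fake_pre_headers(uri))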
async def post(self, uri: str, data: dict, **kwargs) -> Dict: async def post(self, uri: str, data: dict, **kwargs) -> Dict:
""" """
@@ -158,9 +154,7 @@ class XiaoHongShuClient(AbstractApiClient):
async with httpx.AsyncClient(proxies=self.proxies) as client: async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request("GET", url, timeout=self.timeout) response = await client.request("GET", url, timeout=self.timeout)
if not response.reason_phrase == "OK": if not response.reason_phrase == "OK":
utils.logger.error( utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}"
)
return None return None
else: else:
return response.content return response.content
@@ -179,9 +173,7 @@ class XiaoHongShuClient(AbstractApiClient):
if note_card.get("items"): if note_card.get("items"):
ping_flag = True ping_flag = True
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again..."
)
ping_flag = False ping_flag = False
return ping_flag return ping_flag
@@ -231,7 +223,10 @@ class XiaoHongShuClient(AbstractApiClient):
return await self.post(uri, data) return await self.post(uri, data)
async def get_note_by_id( async def get_note_by_id(
self, note_id: str, xsec_source: str, xsec_token: str self,
note_id: str,
xsec_source: str,
xsec_token: str,
) -> Dict: ) -> Dict:
""" """
获取笔记详情API 获取笔记详情API
@@ -249,7 +244,9 @@ class XiaoHongShuClient(AbstractApiClient):
data = { data = {
"source_note_id": note_id, "source_note_id": note_id,
"image_formats": ["jpg", "webp", "avif"], "image_formats": ["jpg", "webp", "avif"],
"extra": {"need_body_topic": 1}, "extra": {
"need_body_topic": 1
},
"xsec_source": xsec_source, "xsec_source": xsec_source,
"xsec_token": xsec_token, "xsec_token": xsec_token,
} }
@@ -259,13 +256,14 @@ class XiaoHongShuClient(AbstractApiClient):
res_dict: Dict = res["items"][0]["note_card"] res_dict: Dict = res["items"][0]["note_card"]
return res_dict return res_dict
# 爬取频繁了可能会出现有的笔记能有结果有的没有 # 爬取频繁了可能会出现有的笔记能有结果有的没有
utils.logger.error( utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}"
)
return dict() return dict()
async def get_note_comments( async def get_note_comments(
self, note_id: str, xsec_token: str, cursor: str = "" self,
note_id: str,
xsec_token: str,
cursor: str = "",
) -> Dict: ) -> Dict:
""" """
获取一级评论的API 获取一级评论的API
@@ -342,19 +340,15 @@ class XiaoHongShuClient(AbstractApiClient):
comments_has_more = True comments_has_more = True
comments_cursor = "" comments_cursor = ""
while comments_has_more and len(result) < max_count: while comments_has_more and len(result) < max_count:
comments_res = await self.get_note_comments( comments_res = await self.get_note_comments(note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor)
note_id=note_id, xsec_token=xsec_token, cursor=comments_cursor
)
comments_has_more = comments_res.get("has_more", False) comments_has_more = comments_res.get("has_more", False)
comments_cursor = comments_res.get("cursor", "") comments_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res: if "comments" not in comments_res:
utils.logger.info( utils.logger.info(f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}"
)
break break
comments = comments_res["comments"] comments = comments_res["comments"]
if len(result) + len(comments) > max_count: if len(result) + len(comments) > max_count:
comments = comments[: max_count - len(result)] comments = comments[:max_count - len(result)]
if callback: if callback:
await callback(note_id, comments) await callback(note_id, comments)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
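The max_count cap works by trimming the final page so the total handed to the callback never exceeds the configured limit. A tiny sketch of that trimming, with an assumed limit:

from typing import Dict, List

max_count = 5                                             # assumed CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
result: List[Dict] = [{"id": i} for i in range(3)]        # comments collected so far
comments: List[Dict] = [{"id": i} for i in range(10, 14)] # next page (4 items)

if len(result) + len(comments) > max_count:
    comments = comments[:max_count - len(result)]         # keep only what still fits

result.extend(comments)
print(len(result))  # 5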
@@ -387,9 +381,7 @@ class XiaoHongShuClient(AbstractApiClient):
""" """
if not config.ENABLE_GET_SUB_COMMENTS: if not config.ENABLE_GET_SUB_COMMENTS:
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
)
return [] return []
result = [] result = []
@@ -416,16 +408,12 @@ class XiaoHongShuClient(AbstractApiClient):
) )
if comments_res is None: if comments_res is None:
utils.logger.info( utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}")
f"[XiaoHongShuClient.get_comments_all_sub_comments] No response found for note_id: {note_id}"
)
continue continue
sub_comment_has_more = comments_res.get("has_more", False) sub_comment_has_more = comments_res.get("has_more", False)
sub_comment_cursor = comments_res.get("cursor", "") sub_comment_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res: if "comments" not in comments_res:
utils.logger.info( utils.logger.info(f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}"
)
break break
comments = comments_res["comments"] comments = comments_res["comments"]
if callback: if callback:
@@ -441,12 +429,8 @@ class XiaoHongShuClient(AbstractApiClient):
eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217 eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
""" """
uri = f"/user/profile/{user_id}" uri = f"/user/profile/{user_id}"
html_content = await self.request( html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers)
"GET", self._domain + uri, return_response=True, headers=self.headers match = re.search(r"<script>window.__INITIAL_STATE__=(.+)<\/script>", html_content, re.M)
)
match = re.search(
r"<script>window.__INITIAL_STATE__=(.+)<\/script>", html_content, re.M
)
if match is None: if match is None:
return {} return {}
@@ -457,7 +441,10 @@ class XiaoHongShuClient(AbstractApiClient):
return info.get("user").get("userPageData") return info.get("user").get("userPageData")
async def get_notes_by_creator( async def get_notes_by_creator(
self, creator: str, cursor: str, page_size: int = 30 self,
creator: str,
cursor: str,
page_size: int = 30,
) -> Dict: ) -> Dict:
""" """
获取博主的笔记 获取博主的笔记
@@ -500,23 +487,17 @@ class XiaoHongShuClient(AbstractApiClient):
while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
notes_res = await self.get_notes_by_creator(user_id, notes_cursor) notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
if not notes_res: if not notes_res:
utils.logger.error( utils.logger.error(f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
)
break break
notes_has_more = notes_res.get("has_more", False) notes_has_more = notes_res.get("has_more", False)
notes_cursor = notes_res.get("cursor", "") notes_cursor = notes_res.get("cursor", "")
if "notes" not in notes_res: if "notes" not in notes_res:
utils.logger.info( utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}"
)
break break
notes = notes_res["notes"] notes = notes_res["notes"]
utils.logger.info( utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}"
)
remaining = config.CRAWLER_MAX_NOTES_COUNT - len(result) remaining = config.CRAWLER_MAX_NOTES_COUNT - len(result)
if remaining <= 0: if remaining <= 0:
@@ -529,9 +510,7 @@ class XiaoHongShuClient(AbstractApiClient):
result.extend(notes_to_add) result.extend(notes_to_add)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
utils.logger.info( utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] Finished getting notes for user {user_id}, total: {len(result)}")
f"[XiaoHongShuClient.get_all_notes_by_creator] Finished getting notes for user {user_id}, total: {len(result)}"
)
return result return result
async def get_note_short_url(self, note_id: str) -> Dict: async def get_note_short_url(self, note_id: str) -> Dict:
@@ -582,35 +561,20 @@ class XiaoHongShuClient(AbstractApiClient):
elif isinstance(value, dict): elif isinstance(value, dict):
dict_new[new_key] = transform_json_keys(json.dumps(value)) dict_new[new_key] = transform_json_keys(json.dumps(value))
elif isinstance(value, list): elif isinstance(value, list):
dict_new[new_key] = [ dict_new[new_key] = [(transform_json_keys(json.dumps(item)) if (item and isinstance(item, dict)) else item) for item in value]
(
transform_json_keys(json.dumps(item))
if (item and isinstance(item, dict))
else item
)
for item in value
]
else: else:
dict_new[new_key] = value dict_new[new_key] = value
return dict_new return dict_new
url = ( url = ("https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}")
"https://www.xiaohongshu.com/explore/"
+ note_id
+ f"?xsec_token={xsec_token}&xsec_source={xsec_source}"
)
copy_headers = self.headers.copy() copy_headers = self.headers.copy()
if not enable_cookie: if not enable_cookie:
del copy_headers["Cookie"] del copy_headers["Cookie"]
html = await self.request( html = await self.request(method="GET", url=url, return_response=True, headers=copy_headers)
method="GET", url=url, return_response=True, headers=copy_headers
)
def get_note_dict(html): def get_note_dict(html):
state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[ state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
0
].replace("undefined", '""')
if state != "{}": if state != "{}":
note_dict = transform_json_keys(state) note_dict = transform_json_keys(state)
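get_note_by_id_from_html falls back to scraping the note page itself: it pulls window.__INITIAL_STATE__ out of the HTML, replaces bare undefined tokens so the blob parses as JSON, and then normalizes the keys via transform_json_keys. A sketch of just the extraction step on a made-up fragment (key normalization omitted):

import json
import re

# Made-up page fragment shaped like an xiaohongshu note page.
html = (
    "<html><script>window.__INITIAL_STATE__="
    '{"note":{"noteDetailMap":{"abc":{"note":{"title":"demo","video":undefined}}}}}'
    "</script></html>"
)

state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
note_state = json.loads(state)
print(note_state["note"]["noteDetailMap"]["abc"]["note"]["title"])  # demo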

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import asyncio import asyncio
import os import os
import random import random
@@ -57,13 +56,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def start(self) -> None: async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
ip_proxy_pool = await create_ip_pool( ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
)
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info( playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
ip_proxy_info
)
async with async_playwright() as playwright: async with async_playwright() as playwright:
# 根据配置选择启动模式 # 根据配置选择启动模式
@@ -101,9 +96,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
cookie_str=config.COOKIES, cookie_str=config.COOKIES,
) )
await login_obj.begin() await login_obj.begin()
await self.xhs_client.update_cookies( await self.xhs_client.update_cookies(browser_context=self.browser_context)
browser_context=self.browser_context
)
crawler_type_var.set(config.CRAWLER_TYPE) crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search": if config.CRAWLER_TYPE == "search":
@@ -122,47 +115,33 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def search(self) -> None: async def search(self) -> None:
"""Search for notes and retrieve their comment information.""" """Search for notes and retrieve their comment information."""
utils.logger.info( utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
"[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords"
)
xhs_limit_count = 20 # xhs limit page fixed value xhs_limit_count = 20 # xhs limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
start_page = config.START_PAGE start_page = config.START_PAGE
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword) source_keyword_var.set(keyword)
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}"
)
page = 1 page = 1
search_id = get_search_id() search_id = get_search_id()
while ( while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
page - start_page + 1
) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page: if page < start_page:
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}") utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
page += 1 page += 1
continue continue
try: try:
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}")
f"[XiaoHongShuCrawler.search] search xhs keyword: {keyword}, page: {page}"
)
note_ids: List[str] = [] note_ids: List[str] = []
xsec_tokens: List[str] = [] xsec_tokens: List[str] = []
notes_res = await self.xhs_client.get_note_by_keyword( notes_res = await self.xhs_client.get_note_by_keyword(
keyword=keyword, keyword=keyword,
search_id=search_id, search_id=search_id,
page=page, page=page,
sort=( sort=(SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != "" else SearchSortType.GENERAL),
SearchSortType(config.SORT_TYPE)
if config.SORT_TYPE != ""
else SearchSortType.GENERAL
),
)
utils.logger.info(
f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}"
) )
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
if not notes_res or not notes_res.get("has_more", False): if not notes_res or not notes_res.get("has_more", False):
utils.logger.info("No more content!") utils.logger.info("No more content!")
break break
@@ -173,9 +152,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
xsec_source=post_item.get("xsec_source"), xsec_source=post_item.get("xsec_source"),
xsec_token=post_item.get("xsec_token"), xsec_token=post_item.get("xsec_token"),
semaphore=semaphore, semaphore=semaphore,
) ) for post_item in notes_res.get("items", {}) if post_item.get("model_type") not in ("rec_query", "hot_query")
for post_item in notes_res.get("items", {})
if post_item.get("model_type") not in ("rec_query", "hot_query")
] ]
note_details = await asyncio.gather(*task_list) note_details = await asyncio.gather(*task_list)
for note_detail in note_details: for note_detail in note_details:
@@ -185,26 +162,18 @@ class XiaoHongShuCrawler(AbstractCrawler):
note_ids.append(note_detail.get("note_id")) note_ids.append(note_detail.get("note_id"))
xsec_tokens.append(note_detail.get("xsec_token")) xsec_tokens.append(note_detail.get("xsec_token"))
page += 1 page += 1
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
f"[XiaoHongShuCrawler.search] Note details: {note_details}"
)
await self.batch_get_note_comments(note_ids, xsec_tokens) await self.batch_get_note_comments(note_ids, xsec_tokens)
except DataFetchError: except DataFetchError:
utils.logger.error( utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
"[XiaoHongShuCrawler.search] Get note detail error"
)
break break
async def get_creators_and_notes(self) -> None: async def get_creators_and_notes(self) -> None:
"""Get creator's notes and retrieve their comment information.""" """Get creator's notes and retrieve their comment information."""
utils.logger.info( utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
"[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators"
)
for user_id in config.XHS_CREATOR_ID_LIST: for user_id in config.XHS_CREATOR_ID_LIST:
# get creator detail info from web html content # get creator detail info from web html content
createor_info: Dict = await self.xhs_client.get_creator_info( createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
user_id=user_id
)
if createor_info: if createor_info:
await xhs_store.save_creator(user_id, creator=createor_info) await xhs_store.save_creator(user_id, creator=createor_info)
@@ -238,14 +207,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
xsec_source=post_item.get("xsec_source"), xsec_source=post_item.get("xsec_source"),
xsec_token=post_item.get("xsec_token"), xsec_token=post_item.get("xsec_token"),
semaphore=semaphore, semaphore=semaphore,
) ) for post_item in note_list
for post_item in note_list
] ]
note_details = await asyncio.gather(*task_list) note_details = await asyncio.gather(*task_list)
for note_detail in note_details: for note_detail in note_details:
if note_detail: if note_detail:
await xhs_store.update_xhs_note(note_detail) await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
async def get_specified_notes(self): async def get_specified_notes(self):
""" """
@@ -257,9 +226,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
get_note_detail_task_list = [] get_note_detail_task_list = []
for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST: for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url) note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}"
)
crawler_task = self.get_note_detail_async_task( crawler_task = self.get_note_detail_async_task(
note_id=note_url_info.note_id, note_id=note_url_info.note_id,
xsec_source=note_url_info.xsec_source, xsec_source=note_url_info.xsec_source,
@@ -276,14 +243,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
need_get_comment_note_ids.append(note_detail.get("note_id", "")) need_get_comment_note_ids.append(note_detail.get("note_id", ""))
xsec_tokens.append(note_detail.get("xsec_token", "")) xsec_tokens.append(note_detail.get("xsec_token", ""))
await xhs_store.update_xhs_note(note_detail) await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens) await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)
async def get_note_detail_async_task( async def get_note_detail_async_task(
self, self,
note_id: str, note_id: str,
xsec_source: str, xsec_source: str,
xsec_token: str, xsec_token: str,
semaphore: asyncio.Semaphore, semaphore: asyncio.Semaphore,
) -> Optional[Dict]: ) -> Optional[Dict]:
"""Get note detail """Get note detail
@@ -299,72 +267,49 @@ class XiaoHongShuCrawler(AbstractCrawler):
note_detail = None note_detail = None
async with semaphore: async with semaphore:
try: try:
utils.logger.info( utils.logger.info(f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}")
f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
)
try: try:
note_detail = await self.xhs_client.get_note_by_id( note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
note_id, xsec_source, xsec_token
)
except RetryError as e: except RetryError as e:
pass pass
if not note_detail: if not note_detail:
note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token, enable_cookie=False)
enable_cookie=False)
if not note_detail: if not note_detail:
raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}") raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
note_detail.update( note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source})
{"xsec_token": xsec_token, "xsec_source": xsec_source}
)
return note_detail return note_detail
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error( utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}")
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
)
return None return None
except KeyError as ex: except KeyError as ex:
utils.logger.error( utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] have not found note detail, note_id:{note_id}, err: {ex}")
f"[XiaoHongShuCrawler.get_note_detail_async_task] have not fund note detail note_id:{note_id}, err: {ex}"
)
return None return None
async def batch_get_note_comments( async def batch_get_note_comments(self, note_list: List[str], xsec_tokens: List[str]):
self, note_list: List[str], xsec_tokens: List[str]
):
"""Batch get note comments""" """Batch get note comments"""
if not config.ENABLE_GET_COMMENTS: if not config.ENABLE_GET_COMMENTS:
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
f"[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled"
)
return return
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}"
)
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = [] task_list: List[Task] = []
for index, note_id in enumerate(note_list): for index, note_id in enumerate(note_list):
task = asyncio.create_task( task = asyncio.create_task(
self.get_comments( self.get_comments(note_id=note_id, xsec_token=xsec_tokens[index], semaphore=semaphore),
note_id=note_id, xsec_token=xsec_tokens[index], semaphore=semaphore
),
name=note_id, name=note_id,
) )
task_list.append(task) task_list.append(task)
await asyncio.gather(*task_list) await asyncio.gather(*task_list)
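Aside: the comment-crawling concurrency above reduces to a small asyncio pattern — one shared Semaphore sized by MAX_CONCURRENCY_NUM, one named task per note, and a single gather. The following is a minimal, self-contained sketch of that pattern only; fetch_comments, batch_fetch and the sample note ids are illustrative placeholders, not MediaCrawler code.

import asyncio
import random

MAX_CONCURRENCY_NUM = 1  # stands in for config.MAX_CONCURRENCY_NUM

async def fetch_comments(note_id: str, semaphore: asyncio.Semaphore) -> str:
    # Placeholder for get_comments(): the semaphore caps how many requests run at once.
    async with semaphore:
        await asyncio.sleep(random.random())  # stand-in for the crawl interval
        return f"comments of {note_id}"

async def batch_fetch(note_ids: list) -> list:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    tasks = [
        asyncio.create_task(fetch_comments(note_id, semaphore), name=note_id)
        for note_id in note_ids
    ]
    return await asyncio.gather(*tasks)

if __name__ == "__main__":
    print(asyncio.run(batch_fetch(["note_a", "note_b", "note_c"])))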
async def get_comments( async def get_comments(self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore):
self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
):
"""Get note comments with keyword filtering and quantity limitation""" """Get note comments with keyword filtering and quantity limitation"""
async with semaphore: async with semaphore:
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}"
)
# When proxy is not enabled, increase the crawling interval # When proxy is not enabled, increase the crawling interval
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
crawl_interval = random.random() crawl_interval = random.random()
@@ -380,12 +325,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient: async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
"""Create xhs client""" """Create xhs client"""
utils.logger.info( utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
"[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ..." cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
)
cookie_str, cookie_dict = utils.convert_cookies(
await self.browser_context.cookies()
)
xhs_client_obj = XiaoHongShuClient( xhs_client_obj = XiaoHongShuClient(
proxies=httpx_proxy, proxies=httpx_proxy,
headers={ headers={
@@ -412,44 +353,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
return xhs_client_obj return xhs_client_obj
async def launch_browser( async def launch_browser(
self, self,
chromium: BrowserType, chromium: BrowserType,
playwright_proxy: Optional[Dict], playwright_proxy: Optional[Dict],
user_agent: Optional[str], user_agent: Optional[str],
headless: bool = True, headless: bool = True,
) -> BrowserContext: ) -> BrowserContext:
"""Launch browser and create browser context""" """Launch browser and create browser context"""
utils.logger.info( utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
"[XiaoHongShuCrawler.launch_browser] Begin create browser context ..."
)
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
# feat issue #14 # feat issue #14
# we will save login state to avoid login every time # we will save login state to avoid login every time
user_data_dir = os.path.join( user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM) # type: ignore
os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM
) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,
headless=headless, headless=headless,
proxy=playwright_proxy, # type: ignore proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080}, viewport={
"width": 1920,
"height": 1080
},
user_agent=user_agent, user_agent=user_agent,
) )
return browser_context return browser_context
else: else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context( browser_context = await browser.new_context(viewport={"width": 1920, "height": 1080}, user_agent=user_agent)
viewport={"width": 1920, "height": 1080}, user_agent=user_agent
)
return browser_context return browser_context
async def launch_browser_with_cdp( async def launch_browser_with_cdp(
self, self,
playwright: Playwright, playwright: Playwright,
playwright_proxy: Optional[Dict], playwright_proxy: Optional[Dict],
user_agent: Optional[str], user_agent: Optional[str],
headless: bool = True, headless: bool = True,
) -> BrowserContext: ) -> BrowserContext:
""" """
使用CDP模式启动浏览器 使用CDP模式启动浏览器
@@ -470,14 +408,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
return browser_context return browser_context
except Exception as e: except Exception as e:
utils.logger.error( utils.logger.error(f"[XiaoHongShuCrawler] CDP模式启动失败回退到标准模式: {e}")
f"[XiaoHongShuCrawler] CDP模式启动失败回退到标准模式: {e}"
)
# 回退到标准模式 # 回退到标准模式
chromium = playwright.chromium chromium = playwright.chromium
return await self.launch_browser( return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
chromium, playwright_proxy, user_agent, headless
)
async def close(self): async def close(self):
"""Close browser context""" """Close browser context"""
@@ -490,10 +424,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...") utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
async def get_notice_media(self, note_detail: Dict): async def get_notice_media(self, note_detail: Dict):
if not config.ENABLE_GET_IMAGES: if not config.ENABLE_GET_MEIDAS:
utils.logger.info( utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled"
)
return return
await self.get_note_images(note_detail) await self.get_note_images(note_detail)
await self.get_notice_video(note_detail) await self.get_notice_video(note_detail)
@@ -504,7 +436,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
:param note_item: :param note_item:
:return: :return:
""" """
if not config.ENABLE_GET_IMAGES: if not config.ENABLE_GET_MEIDAS:
return return
note_id = note_item.get("note_id") note_id = note_item.get("note_id")
image_list: List[Dict] = note_item.get("image_list", []) image_list: List[Dict] = note_item.get("image_list", [])
@@ -529,11 +461,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def get_notice_video(self, note_item: Dict): async def get_notice_video(self, note_item: Dict):
""" """
get note images. please use get_notice_media get note videos. please use get_notice_media
:param note_item: :param note_item:
:return: :return:
""" """
if not config.ENABLE_GET_IMAGES: if not config.ENABLE_GET_MEIDAS:
return return
note_id = note_item.get("note_id") note_id = note_item.get("note_id")
@@ -548,4 +480,4 @@ class XiaoHongShuCrawler(AbstractCrawler):
continue continue
extension_file_name = f"{videoNum}.mp4" extension_file_name = f"{videoNum}.mp4"
videoNum += 1 videoNum += 1
await xhs_store.update_xhs_note_image(note_id, content, extension_file_name) await xhs_store.update_xhs_note_video(note_id, content, extension_file_name)
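For context, the media branch above only runs when config.ENABLE_GET_MEIDAS is on: get_notice_media then routes pictures to xhs_store.update_xhs_note_image and the video stream to the new xhs_store.update_xhs_note_video. A rough, hypothetical sketch of that dispatch follows; download_bytes, the store argument and the simplified note fields (e.g. "video_urls") are assumptions made for the example, not the crawler's real client code.

from typing import Dict, List

import httpx  # assumed only for this illustrative download helper

ENABLE_GET_MEIDAS = True  # stands in for config.ENABLE_GET_MEIDAS

async def download_bytes(url: str) -> bytes:
    # Hypothetical helper; the real crawler fetches media through its API client.
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        response.raise_for_status()
        return response.content

async def save_note_media(note_detail: Dict, store) -> None:
    # store is assumed to expose update_xhs_note_image / update_xhs_note_video
    if not ENABLE_GET_MEIDAS:
        return
    note_id = note_detail.get("note_id")
    images: List[Dict] = note_detail.get("image_list", [])
    for index, image in enumerate(images):
        url = image.get("url", "")
        if not url:
            continue
        await store.update_xhs_note_image(note_id, await download_bytes(url), f"{index}.jpg")
    for index, url in enumerate(note_detail.get("video_urls", [])):
        await store.update_xhs_note_video(note_id, await download_bytes(url), f"{index}.mp4")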

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 19:34 # @Time : 2024/1/14 19:34
@@ -20,7 +19,7 @@ import config
from var import source_keyword_var from var import source_keyword_var
from .bilibili_store_impl import * from .bilibili_store_impl import *
from .bilibilli_store_video import * from .bilibilli_store_media import *
class BiliStoreFactory: class BiliStoreFactory:
@@ -35,9 +34,7 @@ class BiliStoreFactory:
def create_store() -> AbstractStore: def create_store() -> AbstractStore:
store_class = BiliStoreFactory.STORES.get(config.SAVE_DATA_OPTION) store_class = BiliStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
if not store_class: if not store_class:
raise ValueError( raise ValueError("[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
"[BiliStoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ..."
)
return store_class() return store_class()
@@ -68,9 +65,7 @@ async def update_bilibili_video(video_item: Dict):
"video_cover_url": video_item_view.get("pic", ""), "video_cover_url": video_item_view.get("pic", ""),
"source_keyword": source_keyword_var.get(), "source_keyword": source_keyword_var.get(),
} }
utils.logger.info( utils.logger.info(f"[store.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{save_content_item.get('title')}")
f"[store.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{save_content_item.get('title')}"
)
await BiliStoreFactory.create_store().store_content(content_item=save_content_item) await BiliStoreFactory.create_store().store_content(content_item=save_content_item)
@@ -89,9 +84,7 @@ async def update_up_info(video_item: Dict):
"user_rank": video_item_card.get("level_info").get("current_level"), "user_rank": video_item_card.get("level_info").get("current_level"),
"is_official": video_item_card.get("official_verify").get("type"), "is_official": video_item_card.get("official_verify").get("type"),
} }
utils.logger.info( utils.logger.info(f"[store.bilibili.update_up_info] bilibili user_id:{video_item_card.get('mid')}")
f"[store.bilibili.update_up_info] bilibili user_id:{video_item_card.get('mid')}"
)
await BiliStoreFactory.create_store().store_creator(creator=saver_up_info) await BiliStoreFactory.create_store().store_creator(creator=saver_up_info)
@@ -123,9 +116,7 @@ async def update_bilibili_video_comment(video_id: str, comment_item: Dict):
"like_count": like_count, "like_count": like_count,
"last_modify_ts": utils.get_current_timestamp(), "last_modify_ts": utils.get_current_timestamp(),
} }
utils.logger.info( utils.logger.info(f"[store.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {save_comment_item.get('content')}")
f"[store.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {save_comment_item.get('content')}"
)
await BiliStoreFactory.create_store().store_comment(comment_item=save_comment_item) await BiliStoreFactory.create_store().store_comment(comment_item=save_comment_item)
@@ -137,13 +128,11 @@ async def store_video(aid, video_content, extension_file_name):
video_content: video_content:
extension_file_name: extension_file_name:
""" """
await BilibiliVideo().store_video( await BilibiliVideo().store_video({
{ "aid": aid,
"aid": aid, "video_content": video_content,
"video_content": video_content, "extension_file_name": extension_file_name,
"extension_file_name": extension_file_name, })
}
)
async def batch_update_bilibili_creator_fans(creator_info: Dict, fans_list: List[Dict]): async def batch_update_bilibili_creator_fans(creator_info: Dict, fans_list: List[Dict]):
@@ -156,14 +145,10 @@ async def batch_update_bilibili_creator_fans(creator_info: Dict, fans_list: List
"sign": fan_item.get("sign"), "sign": fan_item.get("sign"),
"avatar": fan_item.get("face"), "avatar": fan_item.get("face"),
} }
await update_bilibili_creator_contact( await update_bilibili_creator_contact(creator_info=creator_info, fan_info=fan_info)
creator_info=creator_info, fan_info=fan_info
)
async def batch_update_bilibili_creator_followings( async def batch_update_bilibili_creator_followings(creator_info: Dict, followings_list: List[Dict]):
creator_info: Dict, followings_list: List[Dict]
):
if not followings_list: if not followings_list:
return return
for following_item in followings_list: for following_item in followings_list:
@@ -173,14 +158,10 @@ async def batch_update_bilibili_creator_followings(
"sign": following_item.get("sign"), "sign": following_item.get("sign"),
"avatar": following_item.get("face"), "avatar": following_item.get("face"),
} }
await update_bilibili_creator_contact( await update_bilibili_creator_contact(creator_info=following_info, fan_info=creator_info)
creator_info=following_info, fan_info=creator_info
)
async def batch_update_bilibili_creator_dynamics( async def batch_update_bilibili_creator_dynamics(creator_info: Dict, dynamics_list: List[Dict]):
creator_info: Dict, dynamics_list: List[Dict]
):
if not dynamics_list: if not dynamics_list:
return return
for dynamic_item in dynamics_list: for dynamic_item in dynamics_list:
@@ -203,9 +184,7 @@ async def batch_update_bilibili_creator_dynamics(
"total_forwards": dynamic_forward, "total_forwards": dynamic_forward,
"total_liked": dynamic_like, "total_liked": dynamic_like,
} }
await update_bilibili_creator_dynamic( await update_bilibili_creator_dynamic(creator_info=creator_info, dynamic_info=dynamic_info)
creator_info=creator_info, dynamic_info=dynamic_info
)
async def update_bilibili_creator_contact(creator_info: Dict, fan_info: Dict): async def update_bilibili_creator_contact(creator_info: Dict, fan_info: Dict):

View File

@@ -8,40 +8,42 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : helloteemo # @Author : helloteemo
# @Time : 2024/7/12 20:01 # @Time : 2024/7/12 20:01
# @Desc : bilibili图片保存 # @Desc : bilibili 媒体保存
import pathlib import pathlib
from typing import Dict from typing import Dict
import aiofiles import aiofiles
from base.base_crawler import AbstractStoreImage from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
from tools import utils from tools import utils
class BilibiliVideo(AbstractStoreImage): class BilibiliVideo(AbstractStoreVideo):
video_store_path: str = "data/bilibili/videos" video_store_path: str = "data/bilibili/videos"
async def store_video(self, video_content_item: Dict): async def store_video(self, video_content_item: Dict):
""" """
store content store content
Args: Args:
content_item: video_content_item:
Returns: Returns:
""" """
await self.save_video(video_content_item.get("aid"), video_content_item.get("video_content"), await self.save_video(video_content_item.get("aid"), video_content_item.get("video_content"), video_content_item.get("extension_file_name"))
video_content_item.get("extension_file_name"))
def make_save_file_name(self, aid: str, extension_file_name: str) -> str: def make_save_file_name(self, aid: str, extension_file_name: str) -> str:
""" """
make save file name by store type make save file name by store type
Args: Args:
aid: aid aid: aid
extension_file_name: video filename with extension
Returns: Returns:
""" """
@@ -50,9 +52,11 @@ class BilibiliVideo(AbstractStoreImage):
async def save_video(self, aid: int, video_content: str, extension_file_name="mp4"): async def save_video(self, aid: int, video_content: str, extension_file_name="mp4"):
""" """
save video to local save video to local
Args: Args:
aid: aid aid: aid
video_content: video content video_content: video content
extension_file_name: video filename with extension
Returns: Returns:

View File

@@ -18,6 +18,7 @@ import config
from var import source_keyword_var from var import source_keyword_var
from .douyin_store_impl import * from .douyin_store_impl import *
from .douyin_store_media import *
class DouyinStoreFactory: class DouyinStoreFactory:
@@ -233,3 +234,33 @@ async def save_creator(user_id: str, creator: Dict):
} }
utils.logger.info(f"[store.douyin.save_creator] creator:{local_db_item}") utils.logger.info(f"[store.douyin.save_creator] creator:{local_db_item}")
await DouyinStoreFactory.create_store().store_creator(local_db_item) await DouyinStoreFactory.create_store().store_creator(local_db_item)
async def update_dy_aweme_image(aweme_id, pic_content, extension_file_name):
"""
更新抖音笔记图片
Args:
aweme_id:
pic_content:
extension_file_name:
Returns:
"""
await DouYinImage().store_image({"aweme_id": aweme_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
async def update_dy_aweme_video(aweme_id, video_content, extension_file_name):
"""
更新抖音短视频
Args:
aweme_id:
video_content:
extension_file_name:
Returns:
"""
await DouYinVideo().store_video({"aweme_id": aweme_id, "video_content": video_content, "extension_file_name": extension_file_name})
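With these two helpers in place, the Douyin crawler needs a single call per media file. A minimal, hypothetical caller is sketched below; the import path is assumed from this PR's layout (store/douyin/__init__.py), and the id and byte payloads are dummies rather than real API responses.

import asyncio

from store import douyin as douyin_store  # assumed import path

async def demo() -> None:
    aweme_id = "7000000000000000000"  # made-up id for illustration
    # One call per picture of an image-type post ...
    await douyin_store.update_dy_aweme_image(aweme_id, b"fake png bytes", "0.png")
    # ... and one call for the video stream of a short-video post.
    await douyin_store.update_dy_aweme_video(aweme_id, b"fake mp4 bytes", "0.mp4")
    # Files land under data/douyin/images/<aweme_id>/ and data/douyin/videos/<aweme_id>/.

if __name__ == "__main__":
    asyncio.run(demo())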

View File

@@ -0,0 +1,111 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
import pathlib
from typing import Dict
import aiofiles
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
from tools import utils
class DouYinImage(AbstractStoreImage):
image_store_path: str = "data/douyin/images"
async def store_image(self, image_content_item: Dict):
"""
store content
Args:
image_content_item:
Returns:
"""
await self.save_image(image_content_item.get("aweme_id"), image_content_item.get("pic_content"), image_content_item.get("extension_file_name"))
def make_save_file_name(self, aweme_id: str, extension_file_name: str) -> str:
"""
make save file name by store type
Args:
aweme_id: aweme id
extension_file_name: image filename with extension
Returns:
"""
return f"{self.image_store_path}/{aweme_id}/{extension_file_name}"
async def save_image(self, aweme_id: str, pic_content: str, extension_file_name):
"""
save image to local
Args:
aweme_id: aweme id
pic_content: image content
extension_file_name: image filename with extension
Returns:
"""
pathlib.Path(self.image_store_path + "/" + aweme_id).mkdir(parents=True, exist_ok=True)
save_file_name = self.make_save_file_name(aweme_id, extension_file_name)
async with aiofiles.open(save_file_name, 'wb') as f:
await f.write(pic_content)
utils.logger.info(f"[DouYinImageStoreImplement.save_image] save image {save_file_name} success ...")
class DouYinVideo(AbstractStoreVideo):
video_store_path: str = "data/douyin/videos"
async def store_video(self, video_content_item: Dict):
"""
store content
Args:
video_content_item:
Returns:
"""
await self.save_video(video_content_item.get("aweme_id"), video_content_item.get("video_content"), video_content_item.get("extension_file_name"))
def make_save_file_name(self, aweme_id: str, extension_file_name: str) -> str:
"""
make save file name by store type
Args:
aweme_id: aweme id
extension_file_name: video filename with extension
Returns:
"""
return f"{self.video_store_path}/{aweme_id}/{extension_file_name}"
async def save_video(self, aweme_id: str, video_content: str, extension_file_name):
"""
save video to local
Args:
aweme_id: aweme id
video_content: video content
extension_file_name: video filename with extension
Returns:
"""
pathlib.Path(self.video_store_path + "/" + aweme_id).mkdir(parents=True, exist_ok=True)
save_file_name = self.make_save_file_name(aweme_id, extension_file_name)
async with aiofiles.open(save_file_name, 'wb') as f:
await f.write(video_content)
utils.logger.info(f"[DouYinVideoStoreImplement.save_video] save video {save_file_name} success ...")

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 21:34 # @Time : 2024/1/14 21:34
@@ -19,7 +18,7 @@ from typing import List
from var import source_keyword_var from var import source_keyword_var
from .weibo_store_image import * from .weibo_store_media import *
from .weibo_store_impl import * from .weibo_store_impl import *
@@ -35,8 +34,7 @@ class WeibostoreFactory:
def create_store() -> AbstractStore: def create_store() -> AbstractStore:
store_class = WeibostoreFactory.STORES.get(config.SAVE_DATA_OPTION) store_class = WeibostoreFactory.STORES.get(config.SAVE_DATA_OPTION)
if not store_class: if not store_class:
raise ValueError( raise ValueError("[WeibostoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
"[WeibotoreFactory.create_store] Invalid save option only supported csv or db or json or sqlite ...")
return store_class() return store_class()
@@ -91,11 +89,9 @@ async def update_weibo_note(note_item: Dict):
"gender": user_info.get("gender", ""), "gender": user_info.get("gender", ""),
"profile_url": user_info.get("profile_url", ""), "profile_url": user_info.get("profile_url", ""),
"avatar": user_info.get("profile_image_url", ""), "avatar": user_info.get("profile_image_url", ""),
"source_keyword": source_keyword_var.get(), "source_keyword": source_keyword_var.get(),
} }
utils.logger.info( utils.logger.info(f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
await WeibostoreFactory.create_store().store_content(content_item=save_content_item) await WeibostoreFactory.create_store().store_content(content_item=save_content_item)
@@ -150,8 +146,7 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
"profile_url": user_info.get("profile_url", ""), "profile_url": user_info.get("profile_url", ""),
"avatar": user_info.get("profile_image_url", ""), "avatar": user_info.get("profile_image_url", ""),
} }
utils.logger.info( utils.logger.info(f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item) await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
@@ -166,8 +161,7 @@ async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
Returns: Returns:
""" """
await WeiboStoreImage().store_image( await WeiboStoreImage().store_image({"pic_id": picid, "pic_content": pic_content, "extension_file_name": extension_file_name})
{"pic_id": picid, "pic_content": pic_content, "extension_file_name": extension_file_name})
async def save_creator(user_id: str, user_info: Dict): async def save_creator(user_id: str, user_info: Dict):

View File

@@ -8,17 +8,16 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : Erm # @Author : Erm
# @Time : 2024/4/9 17:35 # @Time : 2024/4/9 17:35
# @Desc : 微博保存图片类 # @Desc : 微博媒体保存
import pathlib import pathlib
from typing import Dict from typing import Dict
import aiofiles import aiofiles
from base.base_crawler import AbstractStoreImage from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
from tools import utils from tools import utils
@@ -28,8 +27,9 @@ class WeiboStoreImage(AbstractStoreImage):
async def store_image(self, image_content_item: Dict): async def store_image(self, image_content_item: Dict):
""" """
store content store content
Args: Args:
content_item: image_content_item:
Returns: Returns:
@@ -39,8 +39,10 @@ class WeiboStoreImage(AbstractStoreImage):
def make_save_file_name(self, picid: str, extension_file_name: str) -> str: def make_save_file_name(self, picid: str, extension_file_name: str) -> str:
""" """
make save file name by store type make save file name by store type
Args: Args:
picid: image id picid: image id
extension_file_name: image filename with extension
Returns: Returns:
@@ -50,9 +52,11 @@ class WeiboStoreImage(AbstractStoreImage):
async def save_image(self, picid: str, pic_content: str, extension_file_name="jpg"): async def save_image(self, picid: str, pic_content: str, extension_file_name="jpg"):
""" """
save image to local save image to local
Args: Args:
picid: image id picid: image id
pic_content: image content pic_content: image content
extension_file_name: image filename with extension
Returns: Returns:

View File

@@ -8,7 +8,6 @@
# 详细许可条款请参阅项目根目录下的LICENSE文件。 # 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com # @Author : relakkes@gmail.com
# @Time : 2024/1/14 17:34 # @Time : 2024/1/14 17:34
@@ -19,7 +18,7 @@ import config
from var import source_keyword_var from var import source_keyword_var
from . import xhs_store_impl from . import xhs_store_impl
from .xhs_store_image import * from .xhs_store_media import *
from .xhs_store_impl import * from .xhs_store_impl import *
@@ -28,7 +27,7 @@ class XhsStoreFactory:
"csv": XhsCsvStoreImplement, "csv": XhsCsvStoreImplement,
"db": XhsDbStoreImplement, "db": XhsDbStoreImplement,
"json": XhsJsonStoreImplement, "json": XhsJsonStoreImplement,
"sqlite": XhsSqliteStoreImplement "sqlite": XhsSqliteStoreImplement,
} }
@staticmethod @staticmethod
@@ -88,27 +87,27 @@ async def update_xhs_note(note_item: Dict):
video_url = ','.join(get_video_url_arr(note_item)) video_url = ','.join(get_video_url_arr(note_item))
local_db_item = { local_db_item = {
"note_id": note_item.get("note_id"), # 帖子id "note_id": note_item.get("note_id"), # 帖子id
"type": note_item.get("type"), # 帖子类型 "type": note_item.get("type"), # 帖子类型
"title": note_item.get("title") or note_item.get("desc", "")[:255], # 帖子标题 "title": note_item.get("title") or note_item.get("desc", "")[:255], # 帖子标题
"desc": note_item.get("desc", ""), # 帖子描述 "desc": note_item.get("desc", ""), # 帖子描述
"video_url": video_url, # 帖子视频url "video_url": video_url, # 帖子视频url
"time": note_item.get("time"), # 帖子发布时间 "time": note_item.get("time"), # 帖子发布时间
"last_update_time": note_item.get("last_update_time", 0), # 帖子最后更新时间 "last_update_time": note_item.get("last_update_time", 0), # 帖子最后更新时间
"user_id": user_info.get("user_id"), # 用户id "user_id": user_info.get("user_id"), # 用户id
"nickname": user_info.get("nickname"), # 用户昵称 "nickname": user_info.get("nickname"), # 用户昵称
"avatar": user_info.get("avatar"), # 用户头像 "avatar": user_info.get("avatar"), # 用户头像
"liked_count": interact_info.get("liked_count"), # 点赞数 "liked_count": interact_info.get("liked_count"), # 点赞数
"collected_count": interact_info.get("collected_count"), # 收藏数 "collected_count": interact_info.get("collected_count"), # 收藏数
"comment_count": interact_info.get("comment_count"), # 评论数 "comment_count": interact_info.get("comment_count"), # 评论数
"share_count": interact_info.get("share_count"), # 分享数 "share_count": interact_info.get("share_count"), # 分享数
"ip_location": note_item.get("ip_location", ""), # ip地址 "ip_location": note_item.get("ip_location", ""), # ip地址
"image_list": ','.join([img.get('url', '') for img in image_list]), # 图片url "image_list": ','.join([img.get('url', '') for img in image_list]), # 图片url
"tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']), # 标签 "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']), # 标签
"last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳MediaCrawler程序生成的主要用途在db存储的时候记录一条记录最新更新时间 "last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳MediaCrawler程序生成的主要用途在db存储的时候记录一条记录最新更新时间
"note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search", # 帖子url "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search", # 帖子url
"source_keyword": source_keyword_var.get(), # 搜索关键词 "source_keyword": source_keyword_var.get(), # 搜索关键词
"xsec_token": note_item.get("xsec_token"), # xsec_token "xsec_token": note_item.get("xsec_token"), # xsec_token
} }
utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}") utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
await XhsStoreFactory.create_store().store_content(local_db_item) await XhsStoreFactory.create_store().store_content(local_db_item)
@@ -145,18 +144,18 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])] comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
target_comment = comment_item.get("target_comment", {}) target_comment = comment_item.get("target_comment", {})
local_db_item = { local_db_item = {
"comment_id": comment_id, # 评论id "comment_id": comment_id, # 评论id
"create_time": comment_item.get("create_time"), # 评论时间 "create_time": comment_item.get("create_time"), # 评论时间
"ip_location": comment_item.get("ip_location"), # ip地址 "ip_location": comment_item.get("ip_location"), # ip地址
"note_id": note_id, # 帖子id "note_id": note_id, # 帖子id
"content": comment_item.get("content"), # 评论内容 "content": comment_item.get("content"), # 评论内容
"user_id": user_info.get("user_id"), # 用户id "user_id": user_info.get("user_id"), # 用户id
"nickname": user_info.get("nickname"), # 用户昵称 "nickname": user_info.get("nickname"), # 用户昵称
"avatar": user_info.get("image"), # 用户头像 "avatar": user_info.get("image"), # 用户头像
"sub_comment_count": comment_item.get("sub_comment_count", 0), # 子评论数 "sub_comment_count": comment_item.get("sub_comment_count", 0), # 子评论数
"pictures": ",".join(comment_pictures), # 评论图片 "pictures": ",".join(comment_pictures), # 评论图片
"parent_comment_id": target_comment.get("id", 0), # 父评论id "parent_comment_id": target_comment.get("id", 0), # 父评论id
"last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳MediaCrawler程序生成的主要用途在db存储的时候记录一条记录最新更新时间 "last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳MediaCrawler程序生成的主要用途在db存储的时候记录一条记录最新更新时间
"like_count": comment_item.get("like_count", 0), "like_count": comment_item.get("like_count", 0),
} }
utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}") utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}")
@@ -197,16 +196,16 @@ async def save_creator(user_id: str, creator: Dict):
local_db_item = { local_db_item = {
'user_id': user_id, # 用户id 'user_id': user_id, # 用户id
'nickname': user_info.get('nickname'), # 昵称 'nickname': user_info.get('nickname'), # 昵称
'gender': get_gender(user_info.get('gender')), # 性别 'gender': get_gender(user_info.get('gender')), # 性别
'avatar': user_info.get('images'), # 头像 'avatar': user_info.get('images'), # 头像
'desc': user_info.get('desc'), # 个人描述 'desc': user_info.get('desc'), # 个人描述
'ip_location': user_info.get('ipLocation'), # ip地址 'ip_location': user_info.get('ipLocation'), # ip地址
'follows': follows, # 关注数 'follows': follows, # 关注数
'fans': fans, # 粉丝数 'fans': fans, # 粉丝数
'interaction': interaction, # 互动数 'interaction': interaction, # 互动数
'tag_list': json.dumps({tag.get('tagType'): tag.get('name') for tag in creator.get('tags')}, 'tag_list': json.dumps({tag.get('tagType'): tag.get('name')
ensure_ascii=False), # 标签 for tag in creator.get('tags')}, ensure_ascii=False), # 标签
"last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳MediaCrawler程序生成的主要用途在db存储的时候记录一条记录最新更新时间 "last_modify_ts": utils.get_current_timestamp(), # 最后更新时间戳MediaCrawler程序生成的主要用途在db存储的时候记录一条记录最新更新时间
} }
utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}") utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
await XhsStoreFactory.create_store().store_creator(local_db_item) await XhsStoreFactory.create_store().store_creator(local_db_item)
@@ -214,7 +213,7 @@ async def save_creator(user_id: str, creator: Dict):
async def update_xhs_note_image(note_id, pic_content, extension_file_name): async def update_xhs_note_image(note_id, pic_content, extension_file_name):
""" """
更新小红书笔 更新小红书笔记图片
Args: Args:
note_id: note_id:
pic_content: pic_content:
@@ -224,5 +223,19 @@ async def update_xhs_note_image(note_id, pic_content, extension_file_name):
""" """
await XiaoHongShuImage().store_image( await XiaoHongShuImage().store_image({"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
{"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
async def update_xhs_note_video(note_id, video_content, extension_file_name):
"""
更新小红书笔记视频
Args:
note_id:
video_content:
extension_file_name:
Returns:
"""
await XiaoHongShuVideo().store_video({"notice_id": note_id, "video_content": video_content, "extension_file_name": extension_file_name})
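Both helpers wrap the raw bytes in a small dict keyed by "notice_id" before handing off to the media store, so images and videos of one note land in sibling folders. A compact, hypothetical example (made-up note id, dummy bytes; import path assumed from store/xhs/__init__.py):

import asyncio

from store import xhs as xhs_store  # assumed import path

async def demo() -> None:
    note_id = "64f0c1a2000000001e03abcd"  # made-up note id
    await xhs_store.update_xhs_note_image(note_id, b"fake jpg bytes", "0.jpg")
    await xhs_store.update_xhs_note_video(note_id, b"fake mp4 bytes", "0.mp4")
    # -> data/xhs/images/<note_id>/0.jpg and data/xhs/videos/<note_id>/0.mp4

if __name__ == "__main__":
    asyncio.run(demo())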

View File

@@ -1,66 +0,0 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : helloteemo
# @Time : 2024/7/11 22:35
# @Desc : 小红书图片保存
import pathlib
from typing import Dict
import aiofiles
from base.base_crawler import AbstractStoreImage
from tools import utils
class XiaoHongShuImage(AbstractStoreImage):
image_store_path: str = "data/xhs/images"
async def store_image(self, image_content_item: Dict):
"""
store content
Args:
content_item:
Returns:
"""
await self.save_image(image_content_item.get("notice_id"), image_content_item.get("pic_content"),
image_content_item.get("extension_file_name"))
def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
"""
make save file name by store type
Args:
notice_id: notice id
picid: image id
Returns:
"""
return f"{self.image_store_path}/{notice_id}/{extension_file_name}"
async def save_image(self, notice_id: str, pic_content: str, extension_file_name="jpg"):
"""
save image to local
Args:
notice_id: notice id
pic_content: image content
Returns:
"""
pathlib.Path(self.image_store_path + "/" + notice_id).mkdir(parents=True, exist_ok=True)
save_file_name = self.make_save_file_name(notice_id, extension_file_name)
async with aiofiles.open(save_file_name, 'wb') as f:
await f.write(pic_content)
utils.logger.info(f"[XiaoHongShuImageStoreImplement.save_image] save image {save_file_name} success ...")

View File

@@ -0,0 +1,115 @@
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
# -*- coding: utf-8 -*-
# @Author : helloteemo
# @Time : 2024/7/11 22:35
# @Desc : 小红书媒体保存
import pathlib
from typing import Dict
import aiofiles
from base.base_crawler import AbstractStoreImage, AbstractStoreVideo
from tools import utils
class XiaoHongShuImage(AbstractStoreImage):
image_store_path: str = "data/xhs/images"
async def store_image(self, image_content_item: Dict):
"""
store content
Args:
image_content_item:
Returns:
"""
await self.save_image(image_content_item.get("notice_id"), image_content_item.get("pic_content"), image_content_item.get("extension_file_name"))
def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
"""
make save file name by store type
Args:
notice_id: notice id
extension_file_name: image filename with extension
Returns:
"""
return f"{self.image_store_path}/{notice_id}/{extension_file_name}"
async def save_image(self, notice_id: str, pic_content: str, extension_file_name):
"""
save image to local
Args:
notice_id: notice id
pic_content: image content
extension_file_name: image filename with extension
Returns:
"""
pathlib.Path(self.image_store_path + "/" + notice_id).mkdir(parents=True, exist_ok=True)
save_file_name = self.make_save_file_name(notice_id, extension_file_name)
async with aiofiles.open(save_file_name, 'wb') as f:
await f.write(pic_content)
utils.logger.info(f"[XiaoHongShuImageStoreImplement.save_image] save image {save_file_name} success ...")
class XiaoHongShuVideo(AbstractStoreVideo):
video_store_path: str = "data/xhs/videos"
async def store_video(self, video_content_item: Dict):
"""
store content
Args:
video_content_item:
Returns:
"""
await self.save_video(video_content_item.get("notice_id"), video_content_item.get("video_content"), video_content_item.get("extension_file_name"))
def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
"""
make save file name by store type
Args:
notice_id: notice id
extension_file_name: video filename with extension
Returns:
"""
return f"{self.video_store_path}/{notice_id}/{extension_file_name}"
async def save_video(self, notice_id: str, video_content: str, extension_file_name):
"""
save video to local
Args:
notice_id: notice id
video_content: video content
extension_file_name: video filename with extension
Returns:
"""
pathlib.Path(self.video_store_path + "/" + notice_id).mkdir(parents=True, exist_ok=True)
save_file_name = self.make_save_file_name(notice_id, extension_file_name)
async with aiofiles.open(save_file_name, 'wb') as f:
await f.write(video_content)
utils.logger.info(f"[XiaoHongShuVideoStoreImplement.save_video] save video {save_file_name} success ...")