From 81f2dbe4ab71f4bb5f2a65d9f9ba1333d3aea3ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=AA=E6=9D=A5=E5=8F=AF=E6=AC=BA?= <2513502304@qq.com> Date: Tue, 5 Aug 2025 13:11:00 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E5=AF=B9=E5=AA=92?= =?UTF-8?q?=E4=BD=93=E8=B5=84=E6=BA=90=E6=9C=8D=E5=8A=A1=E5=99=A8=E7=9A=84?= =?UTF-8?q?=E5=BC=82=E5=B8=B8=E5=A4=84=E7=90=86=EF=BC=8C=E5=8F=82=E8=A7=81?= =?UTF-8?q?=20issue=20#691?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/bilibili/client.py | 15 ++++++++++----- media_platform/bilibili/core.py | 1 + media_platform/douyin/client.py | 19 ++++++++++++------- media_platform/douyin/core.py | 2 ++ media_platform/weibo/client.py | 17 +++++++++++------ media_platform/weibo/core.py | 1 + media_platform/xhs/client.py | 17 +++++++++++------ media_platform/xhs/core.py | 2 ++ 8 files changed, 50 insertions(+), 24 deletions(-) diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index 7ace00d..1aed914 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -202,12 +202,17 @@ class BilibiliClient(AbstractApiClient): async def get_video_media(self, url: str) -> Union[bytes, None]: async with httpx.AsyncClient(proxy=self.proxy) as client: - response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) - if not response.reason_phrase == "OK": - utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}") + try: + response = await client.request("GET", url, timeout=self.timeout, headers=self.headers) + response.raise_for_status() + if not response.reason_phrase == "OK": + utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}") + return None + else: + return response.content + except httpx.HTTPStatusError as exc: # only non-2xx responses reported by raise_for_status() are caught here; connection errors and timeouts (httpx.RequestError) still propagate to the caller
+ utils.logger.error(f"[BilibiliClient.get_video_media] {exc}") return None - else: - return response.content async def get_video_comments( self, diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 2c5ed91..e63d31a 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -544,6 +544,7 @@ class BilibiliCrawler(AbstractCrawler): return content = await self.bili_client.get_video_media(video_url) + await asyncio.sleep(random.random()) if content is None: return extension_file_name = f"video.mp4" diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index 4a3770a..6722eef 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -30,7 +30,7 @@ class DouYinClient(AbstractApiClient): def __init__( self, - timeout=30, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间 + timeout=60, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间 proxy=None, *, headers: Dict, @@ -305,7 +305,7 @@ class DouYinClient(AbstractApiClient): posts_has_more = aweme_post_res.get("has_more", 0) max_cursor = aweme_post_res.get("max_cursor") aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else [] - utils.logger.info(f"[DouYinCrawler.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}") + utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}") if callback: await callback(aweme_list) result.extend(aweme_list) @@ -313,9 +313,14 @@ class DouYinClient(AbstractApiClient): async def get_aweme_media(self, url: str) -> Union[bytes, None]: async with httpx.AsyncClient(proxy=self.proxy) as client: - response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True) - if not response.reason_phrase == "OK": - utils.logger.error(f"[DouYinCrawler.get_aweme_media] request {url} err, res:{response.text}") + try: + response = await client.request("GET", url, timeout=self.timeout,
follow_redirects=True) + response.raise_for_status() + if not response.reason_phrase == "OK": + utils.logger.error(f"[DouYinClient.get_aweme_media] request {url} err, res:{response.text}") + return None + else: + return response.content + except httpx.HTTPStatusError as exc: # only non-2xx responses reported by raise_for_status() are caught here; connection errors and timeouts (httpx.RequestError) still propagate to the caller + utils.logger.error(f"[DouYinClient.get_aweme_media] {exc}") return None - else: - return response.content diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index cc6a21f..1d7ce4d 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -362,6 +362,7 @@ class DouYinCrawler(AbstractCrawler): if not url: continue content = await self.dy_client.get_aweme_media(url) + await asyncio.sleep(random.random()) if content is None: continue extension_file_name = f"{picNum:>03d}.jpeg" @@ -385,6 +386,7 @@ class DouYinCrawler(AbstractCrawler): if not video_download_url: return content = await self.dy_client.get_aweme_media(video_download_url) + await asyncio.sleep(random.random()) if content is None: return extension_file_name = f"video.mp4" diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index a1404d1..ee4b775 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -35,7 +35,7 @@ class WeiboClient: def __init__( self, - timeout=30, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间 + timeout=60, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间 proxy=None, *, headers: Dict[str, str], @@ -248,12 +248,17 @@ class WeiboClient: final_uri = (f"{self._image_agent_host}" f"{image_url}") async with httpx.AsyncClient(proxy=self.proxy) as client: - response = await client.request("GET", final_uri, timeout=self.timeout) - if not response.reason_phrase == "OK": - utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}") + try: + response = await client.request("GET", final_uri, timeout=self.timeout) +
response.raise_for_status() + if not response.reason_phrase == "OK": + utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}") + return None + else: + return response.content + except httpx.HTTPStatusError as exc: # only non-2xx responses reported by raise_for_status() are caught here; connection errors and timeouts (httpx.RequestError) still propagate to the caller + utils.logger.error(f"[DouYinClient.get_aweme_media] {exc}") return None - else: - return response.content async def get_creator_container_info(self, creator_id: str) -> Dict: """ diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index f789d33..552801f 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -250,6 +250,7 @@ class WeiboCrawler(AbstractCrawler): if not url: continue content = await self.wb_client.get_note_image(url) + await asyncio.sleep(random.random()) if content != None: extension_file_name = url.split(".")[-1] await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name) diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 830c1d6..2850c85 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -32,7 +32,7 @@ class XiaoHongShuClient(AbstractApiClient): def __init__( self, - timeout=30, # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间 + timeout=60, # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间 proxy=None, *, headers: Dict[str, str], @@ -152,12 +152,17 @@ class XiaoHongShuClient(AbstractApiClient): async def get_note_media(self, url: str) -> Union[bytes, None]: async with httpx.AsyncClient(proxy=self.proxy) as client: - response = await client.request("GET", url, timeout=self.timeout) - if not response.reason_phrase == "OK": - utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}") + try: + response = await client.request("GET", url, timeout=self.timeout) + response.raise_for_status() + if not response.reason_phrase == "OK": +
utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}") + return None + else: + return response.content + except httpx.HTTPStatusError as exc: # only non-2xx responses reported by raise_for_status() are caught here; connection errors and timeouts (httpx.RequestError) still propagate to the caller + utils.logger.error(f"[DouYinClient.get_aweme_media] {exc}") return None - else: - return response.content async def pong(self) -> bool: """ diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 8526a3e..9c88f1c 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -453,6 +453,7 @@ class XiaoHongShuCrawler(AbstractCrawler): if not url: continue content = await self.xhs_client.get_note_media(url) + await asyncio.sleep(random.random()) if content is None: continue extension_file_name = f"{picNum}.jpg" @@ -476,6 +477,7 @@ class XiaoHongShuCrawler(AbstractCrawler): videoNum = 0 for url in videos: content = await self.xhs_client.get_note_media(url) + await asyncio.sleep(random.random()) if content is None: continue extension_file_name = f"{videoNum}.mp4"