Mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2025-11-25 19:37:36 +08:00)
Merge pull request #699 from 2513502304/main
Added exception handling for the media resource servers; see issue #691
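In short, each platform client's media download helper now wraps its httpx request in try/except and calls response.raise_for_status(), so 4xx/5xx replies from the media server raise httpx.HTTPStatusError, which is logged and converted into a None return. A minimal standalone sketch of that pattern, assuming nothing beyond what the hunks below show (fetch_media and logger are illustrative names, not from the repository):

import logging
from typing import Optional, Union

import httpx

logger = logging.getLogger(__name__)

async def fetch_media(url: str, timeout: float = 60, proxy: Optional[str] = None) -> Union[bytes, None]:
    # Same shape as the patched get_*_media helpers: raw bytes on success, None on an HTTP error status.
    async with httpx.AsyncClient(proxy=proxy) as client:
        try:
            response = await client.request("GET", url, timeout=timeout, follow_redirects=True)
            # raise_for_status() turns 4xx/5xx responses into httpx.HTTPStatusError.
            response.raise_for_status()
            return response.content
        except httpx.HTTPStatusError as exc:
            logger.error(f"[fetch_media] request {url} err: {exc}")
            return None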
@@ -202,12 +202,17 @@ class BilibiliClient(AbstractApiClient):
 
     async def get_video_media(self, url: str) -> Union[bytes, None]:
         async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
-            if not response.reason_phrase == "OK":
-                utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
-                return None
-            else:
-                return response.content
+            try:
+                response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
+                response.raise_for_status()
+                if not response.reason_phrase == "OK":
+                    utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
+                    return None
+                else:
+                    return response.content
+            except httpx.HTTPStatusError as exc:  # raised by raise_for_status() when the media server answers with a 4xx/5xx status
+                utils.logger.error(f"[BilibiliClient.get_video_media] {exc}")
+                return None
 
     async def get_video_comments(
         self,
@@ -544,6 +544,7 @@ class BilibiliCrawler(AbstractCrawler):
             return
 
         content = await self.bili_client.get_video_media(video_url)
+        await asyncio.sleep(random.random())
         if content is None:
             return
         extension_file_name = f"video.mp4"
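Alongside the error handling, every crawler call site gains await asyncio.sleep(random.random()), a random pause of up to one second after each media download so consecutive requests are not fired back-to-back. A small self-contained sketch of that throttling idea (download_all and fetch are illustrative names, not from the repository):

import asyncio
import random

async def download_all(urls, fetch):
    # fetch(url) is any coroutine returning bytes or None, like the get_*_media helpers.
    results = []
    for url in urls:
        content = await fetch(url)
        # Random 0-1 second jitter between requests, mirroring the lines added in this commit.
        await asyncio.sleep(random.random())
        if content is None:
            continue
        results.append(content)
    return results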
@@ -30,7 +30,7 @@ class DouYinClient(AbstractApiClient):
 
     def __init__(
         self,
-        timeout=30,  # with the media-crawling option enabled, Douyin short videos need a longer timeout
+        timeout=60,  # with the media-crawling option enabled, Douyin short videos need a longer timeout
         proxy=None,
         *,
         headers: Dict,
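The default timeout doubles from 30 to 60 seconds because the client forwards it to each request as timeout=self.timeout, and large media responses can take longer than 30 seconds to stream. As a side note, httpx also accepts a per-phase httpx.Timeout object if finer control were ever wanted; a hedged sketch of that option (not what this repository does):

import httpx

# 60 s overall budget, but give up after 10 s if the connection itself cannot be established.
timeout = httpx.Timeout(60.0, connect=10.0)

async def fetch(url: str) -> bytes:
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.get(url)
        response.raise_for_status()
        return response.content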
@@ -305,7 +305,7 @@ class DouYinClient(AbstractApiClient):
             posts_has_more = aweme_post_res.get("has_more", 0)
             max_cursor = aweme_post_res.get("max_cursor")
             aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else []
-            utils.logger.info(f"[DouYinCrawler.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
+            utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
             if callback:
                 await callback(aweme_list)
             result.extend(aweme_list)
@@ -313,9 +313,14 @@ class DouYinClient(AbstractApiClient):
 
     async def get_aweme_media(self, url: str) -> Union[bytes, None]:
         async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
-            if not response.reason_phrase == "OK":
-                utils.logger.error(f"[DouYinCrawler.get_aweme_media] request {url} err, res:{response.text}")
-                return None
-            else:
-                return response.content
+            try:
+                response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
+                response.raise_for_status()
+                if not response.reason_phrase == "OK":
+                    utils.logger.error(f"[DouYinClient.get_aweme_media] request {url} err, res:{response.text}")
+                    return None
+                else:
+                    return response.content
+            except httpx.HTTPStatusError as exc:  # raised by raise_for_status() when the media server answers with a 4xx/5xx status
+                utils.logger.error(f"[DouYinClient.get_aweme_media] {exc}")
+                return None
@@ -362,6 +362,7 @@ class DouYinCrawler(AbstractCrawler):
                 if not url:
                     continue
                 content = await self.dy_client.get_aweme_media(url)
+                await asyncio.sleep(random.random())
                 if content is None:
                     continue
                 extension_file_name = f"{picNum:>03d}.jpeg"
@@ -385,6 +386,7 @@ class DouYinCrawler(AbstractCrawler):
         if not video_download_url:
             return
         content = await self.dy_client.get_aweme_media(video_download_url)
+        await asyncio.sleep(random.random())
         if content is None:
             return
         extension_file_name = f"video.mp4"
@@ -35,7 +35,7 @@ class WeiboClient:
 
     def __init__(
         self,
-        timeout=30,  # with the media-crawling option enabled, Weibo images need a longer timeout
+        timeout=60,  # with the media-crawling option enabled, Weibo images need a longer timeout
         proxy=None,
         *,
         headers: Dict[str, str],
@@ -248,12 +248,17 @@ class WeiboClient:
         final_uri = (f"{self._image_agent_host}"
                      f"{image_url}")
         async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request("GET", final_uri, timeout=self.timeout)
-            if not response.reason_phrase == "OK":
-                utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
-                return None
-            else:
-                return response.content
+            try:
+                response = await client.request("GET", final_uri, timeout=self.timeout)
+                response.raise_for_status()
+                if not response.reason_phrase == "OK":
+                    utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
+                    return None
+                else:
+                    return response.content
+            except httpx.HTTPStatusError as exc:  # raised by raise_for_status() when the media server answers with a 4xx/5xx status
+                utils.logger.error(f"[WeiboClient.get_note_image] {exc}")
+                return None
 
     async def get_creator_container_info(self, creator_id: str) -> Dict:
         """
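For context, get_note_image builds final_uri by prefixing the raw image URL with an image-agent host held in self._image_agent_host; with hypothetical values the concatenation works out as:

# Hypothetical values, only to illustrate how final_uri is assembled in get_note_image.
_image_agent_host = "https://image-proxy.example.com/"
image_url = "wx1.example.cn/large/abc123.jpg"
final_uri = (f"{_image_agent_host}"
             f"{image_url}")
# final_uri == "https://image-proxy.example.com/wx1.example.cn/large/abc123.jpg"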
@@ -250,6 +250,7 @@ class WeiboCrawler(AbstractCrawler):
             if not url:
                 continue
             content = await self.wb_client.get_note_image(url)
+            await asyncio.sleep(random.random())
             if content != None:
                 extension_file_name = url.split(".")[-1]
                 await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
@@ -32,7 +32,7 @@ class XiaoHongShuClient(AbstractApiClient):
 
     def __init__(
         self,
-        timeout=30,  # with the media-crawling option enabled, xhs long videos need a longer timeout
+        timeout=60,  # with the media-crawling option enabled, xhs long videos need a longer timeout
         proxy=None,
         *,
         headers: Dict[str, str],
@@ -152,12 +152,17 @@ class XiaoHongShuClient(AbstractApiClient):
 
     async def get_note_media(self, url: str) -> Union[bytes, None]:
         async with httpx.AsyncClient(proxy=self.proxy) as client:
-            response = await client.request("GET", url, timeout=self.timeout)
-            if not response.reason_phrase == "OK":
-                utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
-                return None
-            else:
-                return response.content
+            try:
+                response = await client.request("GET", url, timeout=self.timeout)
+                response.raise_for_status()
+                if not response.reason_phrase == "OK":
+                    utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
+                    return None
+                else:
+                    return response.content
+            except httpx.HTTPStatusError as exc:  # raised by raise_for_status() when the media server answers with a 4xx/5xx status
+                utils.logger.error(f"[XiaoHongShuClient.get_note_media] {exc}")
+                return None
 
     async def pong(self) -> bool:
         """
@@ -453,6 +453,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                 if not url:
                     continue
                 content = await self.xhs_client.get_note_media(url)
+                await asyncio.sleep(random.random())
                 if content is None:
                     continue
                 extension_file_name = f"{picNum}.jpg"
@@ -476,6 +477,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
         videoNum = 0
         for url in videos:
             content = await self.xhs_client.get_note_media(url)
+            await asyncio.sleep(random.random())
             if content is None:
                 continue
             extension_file_name = f"{videoNum}.mp4"
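Taken together, a caller-side sketch of how the patched helpers are meant to be consumed; the actual crawlers persist media through their store modules (e.g. weibo_store.update_weibo_note_image above), so the file handling here is purely illustrative:

import asyncio
import random
from pathlib import Path

async def save_note_videos(xhs_client, videos, out_dir: Path) -> None:
    # Mirrors the crawler loop: fetch, jitter, skip failures (None), then persist.
    out_dir.mkdir(parents=True, exist_ok=True)
    for video_num, url in enumerate(videos):
        content = await xhs_client.get_note_media(url)
        await asyncio.sleep(random.random())
        if content is None:  # the HTTP error was already logged inside the client
            continue
        (out_dir / f"{video_num}.mp4").write_bytes(content)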