Mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2025-11-25 19:37:36 +08:00)
This commit introduces several improvements to enhance the stability and functionality of the Bilibili crawler.

- **Add Retry Logic:** Implement a retry mechanism with exponential backoff when fetching video comments, making the crawler more resilient to transient network issues or API errors (a sketch of the pattern follows this list).
- **Improve Error Handling:** Add a `try...except` block to handle a potential `JSONDecodeError` in the Bilibili client, preventing crashes when the API returns an invalid response (also sketched below).
- **Ensure Clean Shutdown:** Refactor `main.py` to use a `try...finally` block, guaranteeing that the crawler and database connections are properly closed on exit, on error, or on `KeyboardInterrupt`.
- **Update Default Config:** Adjust default configuration values to increase concurrency, enable word cloud generation by default, and refine the Bilibili search mode for more practical usage.
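The retry-with-backoff change could look roughly like the sketch below. This is a minimal illustration, not the project's actual implementation: `fetch_with_retry`, its parameters, and the delay bounds are hypothetical stand-ins for the real comment-fetching path in the Bilibili crawler.

```python
import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

async def fetch_with_retry(
    fetch: Callable[[], Awaitable[T]],  # hypothetical wrapper around the comment API call
    max_retries: int = 3,
    base_delay: float = 1.0,
) -> T:
    """Run an async fetch, retrying with exponential backoff on failure."""
    for attempt in range(1, max_retries + 1):
        try:
            return await fetch()
        except Exception as exc:
            if attempt == max_retries:
                raise  # out of retries; propagate the last error
            delay = base_delay * 2 ** (attempt - 1)  # 1s, 2s, 4s, ...
            print(f"[retry] attempt {attempt} failed ({exc!r}); retrying in {delay:.0f}s")
            await asyncio.sleep(delay)
```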
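Likewise, the `JSONDecodeError` guard might look like this sketch, assuming the client has the raw response body as text; the function name and the empty-dict fallback are illustrative, not the client's real API.

```python
import json

def safe_parse_json(body: str) -> dict:
    """Parse an API response body, returning {} instead of crashing on invalid JSON."""
    try:
        return json.loads(body)
    except json.JSONDecodeError:
        # The API occasionally returns non-JSON bodies (e.g. HTML error pages);
        # treat them as an empty result rather than letting the crawler crash.
        return {}
```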
74 lines
2.3 KiB
Python
# Disclaimer: This code is for learning and research purposes only. Users must abide by the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Do not perform large-scale crawling or disrupt the platform's operations.
# 4. Keep the request rate reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the detailed license terms.
# Using this code indicates your agreement to the principles above and all terms in the LICENSE.

import asyncio
import sys
from typing import Optional

import cmd_arg
import config
import db
from base.base_crawler import AbstractCrawler
from media_platform.bilibili import BilibiliCrawler
from media_platform.douyin import DouYinCrawler
from media_platform.kuaishou import KuaishouCrawler
from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
from media_platform.zhihu import ZhihuCrawler

class CrawlerFactory:
    CRAWLERS = {
        "xhs": XiaoHongShuCrawler,
        "dy": DouYinCrawler,
        "ks": KuaishouCrawler,
        "bili": BilibiliCrawler,
        "wb": WeiboCrawler,
        "tieba": TieBaCrawler,
        "zhihu": ZhihuCrawler
    }

    @staticmethod
    def create_crawler(platform: str) -> AbstractCrawler:
        crawler_class = CrawlerFactory.CRAWLERS.get(platform)
        if not crawler_class:
            raise ValueError(
                f"Invalid media platform: {platform!r}. "
                f"Supported platforms: {', '.join(CrawlerFactory.CRAWLERS)}"
            )
        return crawler_class()

async def main():
    # Init crawler
    crawler: Optional[AbstractCrawler] = None
    try:
        # Parse command-line arguments
        await cmd_arg.parse_cmd()

        # Init DB if results are persisted to a database
        if config.SAVE_DATA_OPTION == "db":
            await db.init_db()

        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
        await crawler.start()
    finally:
        # Always release resources, whether we exit normally, on error,
        # or via KeyboardInterrupt.
        if crawler:
            await crawler.close()
        if config.SAVE_DATA_OPTION == "db":
            await db.close()

if __name__ == '__main__':
    try:
        # asyncio.run(main())
        asyncio.get_event_loop().run_until_complete(main())
    except KeyboardInterrupt:
        print("\n[main] Caught keyboard interrupt, exiting.")
        sys.exit()