feat(core): 新增控制爬虫参数起始页面的页数start_page;perf(argparse): 向命令行解析器添加程序参数起始页面页数和关键字

2025-11-25 03:15:17 +08:00 · 2024-04-12 00:52:47 +08:00
parent bba9841c26
commit 1115b0d90c
9 changed files with 94 additions and 40 deletions
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -32,10 +32,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
        self.index_url = "https://www.xiaohongshu.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
        self.platform = platform
        self.login_type = login_type
        self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword

    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
@@ -99,31 +101,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
        xhs_limit_count = 20  # xhs limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page
+        for keyword in self.keyword.split(","):
            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
            page = 1
-            while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                note_id_list: List[str] = []
-                notes_res = await self.xhs_client.get_note_by_keyword(
-                    keyword=keyword,
-                    page=page,
-                    sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
-                )
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
-                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                task_list = [
-                    self.get_note_detail(post_item.get("id"), semaphore)
-                    for post_item in notes_res.get("items", {})
-                    if post_item.get('model_type') not in ('rec_query', 'hot_query')
-                ]
-                note_details = await asyncio.gather(*task_list)
-                for note_detail in note_details:
-                    if note_detail is not None:
-                        await xhs_store.update_xhs_note(note_detail)
-                        note_id_list.append(note_detail.get("note_id"))
-                page += 1
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
-                await self.batch_get_note_comments(note_id_list)
+            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
+                    page += 1
+                    continue
+
+                try:
+                    note_id_list: List[str] = []
+                    notes_res = await self.xhs_client.get_note_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
+                    )
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [
+                        self.get_note_detail(post_item.get("id"), semaphore)
+                        for post_item in notes_res.get("items", {})
+                        if post_item.get('model_type') not in ('rec_query', 'hot_query')
+                    ]
+                    note_details = await asyncio.gather(*task_list)
+                    for note_detail in note_details:
+                        if note_detail is not None:
+                            await xhs_store.update_xhs_note(note_detail)
+                            note_id_list.append(note_detail.get("note_id"))
+                    page += 1
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
+                    await self.batch_get_note_comments(note_id_list)
+                except DataFetchError:
+                    utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
+                    break

    async def get_creators_and_notes(self) -> None:
        """Get creator's notes and retrieve their comment information."""