Merge pull request #513 from liudongkai/feature_xhs

feat: xhs adds a random wait interval when IP proxy is disabled, and stores the xsec_token field in DB storage mode
程序员阿江(Relakkes)
2024-12-06 13:01:16 +08:00
committed by GitHub
4 changed files with 29 additions and 4 deletions


@@ -21,10 +21,15 @@ PUBLISH_TIME_TYPE = 0
CRAWLER_TYPE = (
"search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
)
# Custom User-Agent (currently only effective for XHS)
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
# Whether to enable IP proxy
ENABLE_IP_PROXY = False
# Maximum crawl interval in seconds when proxy is not enabled (currently only effective for XHS)
CRAWLER_MAX_SLEEP_SEC = 2
# Size of the proxy IP pool
IP_PROXY_POOL_COUNT = 2
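
With these defaults, a run without a proxy pauses between 1 and CRAWLER_MAX_SLEEP_SEC (here 2) seconds per request, while a proxied run keeps the pause under one second. A minimal sketch of the intended semantics, mirroring the crawler changes below (the variable names are the config values above, not new API):

import random

ENABLE_IP_PROXY = False
CRAWLER_MAX_SLEEP_SEC = 2

# Interval drawn the same way the crawler code below draws it.
if ENABLE_IP_PROXY:
    crawl_interval = random.random()  # < 1 second behind a proxy pool
else:
    crawl_interval = random.uniform(1, CRAWLER_MAX_SLEEP_SEC)  # 1-2 seconds without one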


@@ -12,6 +12,7 @@
import asyncio
import os
import random
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -42,7 +43,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
def __init__(self) -> None:
self.index_url = "https://www.xiaohongshu.com"
# self.user_agent = utils.get_user_agent()
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
self.user_agent = config.UA if config.UA else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
@@ -195,10 +196,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
if createor_info:
await xhs_store.save_creator(user_id, creator=createor_info)
# When proxy is not enabled, increase the crawling interval
if config.ENABLE_IP_PROXY:
crawl_interval = random.random()
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
# Get all note information of the creator
all_notes_list = await self.xhs_client.get_all_notes_by_creator(
user_id=user_id,
crawl_interval=random.random(),
crawl_interval=crawl_interval,
callback=self.fetch_creator_notes_detail,
)
@@ -280,6 +286,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""
note_detail_from_html, note_detail_from_api = None, None
async with semaphore:
# When proxy is not enabled, increase the crawling interval
if config.ENABLE_IP_PROXY:
crawl_interval = random.random()
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
try:
# Try to fetch the web-version note detail directly (with cookie)
note_detail_from_html: Optional[Dict] = (
@@ -287,6 +298,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
note_id, xsec_source, xsec_token, enable_cookie=True
)
)
time.sleep(crawl_interval)
if not note_detail_from_html:
# If fetching the web-version note detail fails, try again without cookie
note_detail_from_html = (
@@ -354,10 +366,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
utils.logger.info(
f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}"
)
# When proxy is not enabled, increase the crawling interval
if config.ENABLE_IP_PROXY:
crawl_interval = random.random()
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
await self.xhs_client.get_note_all_comments(
note_id=note_id,
xsec_token=xsec_token,
crawl_interval=random.random(),
crawl_interval=crawl_interval,
callback=xhs_store.batch_update_xhs_note_comments,
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
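
The same proxy check is now repeated in three places (creator notes, note detail, and comments). A hypothetical consolidation, not part of this PR, could move it into a single helper on the crawler class; a sketch assuming the config and random imports already present in this module, with _get_crawl_interval being a name introduced here only for illustration:

def _get_crawl_interval(self) -> float:
    # Short pause (< 1 s) when rotating proxy IPs; 1..CRAWLER_MAX_SLEEP_SEC seconds otherwise.
    if config.ENABLE_IP_PROXY:
        return random.random()
    return random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)

Each call site would then pass crawl_interval=self._get_crawl_interval().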


@@ -534,4 +534,6 @@ CREATE TABLE `zhihu_creator` (
-- add column `like_count` to douyin_aweme_comment
alter table douyin_aweme_comment add column `like_count` varchar(255) NOT NULL DEFAULT '0' COMMENT '点赞数';
alter table xhs_note add column xsec_token varchar(50) default null comment 'signature algorithm';


@@ -107,6 +107,7 @@ async def update_xhs_note(note_item: Dict):
"last_modify_ts": utils.get_current_timestamp(),
"note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",
"source_keyword": source_keyword_var.get(),
"xsec_token": note_item.get("xsec_token"),
}
utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
await XhsStoreFactory.create_store().store_content(local_db_item)
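
Persisting xsec_token alongside note_url means the share link can be rebuilt later without re-crawling. A small sketch of that reconstruction, assuming a stored row that keeps the note_id and xsec_token fields shown above (rebuild_note_url is a hypothetical helper, not part of the store module):

def rebuild_note_url(row: dict) -> str:
    # Mirrors the note_url format written by update_xhs_note.
    return (
        f"https://www.xiaohongshu.com/explore/{row['note_id']}"
        f"?xsec_token={row['xsec_token']}&xsec_source=pc_search"
    )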