feat: xhs: add a random wait interval when the IP proxy is disabled; store the xsec_token field in DB storage mode

This commit is contained in:
liudongkai
2024-12-05 21:10:31 +08:00
parent 9c7e1d499b
commit 33e7ef016d
4 changed files with 29 additions and 4 deletions

View File

@@ -21,10 +21,15 @@ PUBLISH_TIME_TYPE = 0
CRAWLER_TYPE = (
    "search"  # 爬取类型，search(关键词搜索) | detail(帖子详情) | creator(创作者主页数据)
)
+# 自定义User Agent（暂时仅对XHS有效）
+UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
# 是否开启 IP 代理
ENABLE_IP_PROXY = False
+# 未启用代理时的最大爬取间隔，单位秒（暂时仅对XHS有效）
+CRAWLER_MAX_SLEEP_SEC = 2
# 代理IP池数量
IP_PROXY_POOL_COUNT = 2
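
With the defaults above, disabling the proxy makes each XHS request pause for a uniform random 1–CRAWLER_MAX_SLEEP_SEC seconds, while proxy mode keeps the old sub-second pause. A minimal standalone sketch of the range the new setting implies (not the crawler code itself):

    import random

    CRAWLER_MAX_SLEEP_SEC = 2  # value from the config above

    # Non-proxy mode: uniform pause in [1, CRAWLER_MAX_SLEEP_SEC] seconds per request;
    # proxy mode keeps random.random(), i.e. under one second.
    pause = random.uniform(1, CRAWLER_MAX_SLEEP_SEC)
    assert 1 <= pause <= CRAWLER_MAX_SLEEP_SEC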

View File

@@ -12,6 +12,7 @@
import asyncio
import os
import random
+import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
@@ -42,7 +43,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
    def __init__(self) -> None:
        self.index_url = "https://www.xiaohongshu.com"
        # self.user_agent = utils.get_user_agent()
-       self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
+       self.user_agent = config.UA if config.UA else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"

    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
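
The fallback expression treats an empty or unset config.UA as "use the built-in default", so the custom UA only takes effect when it is non-empty. A small illustration of that behaviour, with pick_user_agent as a hypothetical stand-in for the __init__ assignment:

    DEFAULT_UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36")

    def pick_user_agent(configured_ua: str) -> str:
        # Falsy values ("" or None) fall back to the built-in default,
        # mirroring `config.UA if config.UA else "..."` above.
        return configured_ua if configured_ua else DEFAULT_UA

    assert pick_user_agent("") == DEFAULT_UA
    assert pick_user_agent("MyAgent/1.0") == "MyAgent/1.0"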
@@ -195,10 +196,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
            if createor_info:
                await xhs_store.save_creator(user_id, creator=createor_info)

+           # When proxy is not enabled, increase the crawling interval
+           if config.ENABLE_IP_PROXY:
+               crawl_interval = random.random()
+           else:
+               crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
            # Get all note information of the creator
            all_notes_list = await self.xhs_client.get_all_notes_by_creator(
                user_id=user_id,
-               crawl_interval=random.random(),
+               crawl_interval=crawl_interval,
                callback=self.fetch_creator_notes_detail,
            )
@@ -280,6 +286,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
""" """
note_detail_from_html, note_detail_from_api = None, None note_detail_from_html, note_detail_from_api = None, None
async with semaphore: async with semaphore:
# When proxy is not enabled, increase the crawling interval
if config.ENABLE_IP_PROXY:
crawl_interval = random.random()
else:
crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
try: try:
# 尝试直接获取网页版笔记详情携带cookie # 尝试直接获取网页版笔记详情携带cookie
note_detail_from_html: Optional[Dict] = ( note_detail_from_html: Optional[Dict] = (
@@ -287,6 +298,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
note_id, xsec_source, xsec_token, enable_cookie=True note_id, xsec_source, xsec_token, enable_cookie=True
) )
) )
time.sleep(crawl_interval)
if not note_detail_from_html: if not note_detail_from_html:
# 如果网页版笔记详情获取失败则尝试不使用cookie获取 # 如果网页版笔记详情获取失败则尝试不使用cookie获取
note_detail_from_html = ( note_detail_from_html = (
@@ -354,10 +366,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
        utils.logger.info(
            f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}"
        )
+       # When proxy is not enabled, increase the crawling interval
+       if config.ENABLE_IP_PROXY:
+           crawl_interval = random.random()
+       else:
+           crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
        await self.xhs_client.get_note_all_comments(
            note_id=note_id,
            xsec_token=xsec_token,
-           crawl_interval=random.random(),
+           crawl_interval=crawl_interval,
            callback=xhs_store.batch_update_xhs_note_comments,
            max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
        )
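
The same proxy-aware branch now appears three times in core.py (creator notes, note detail, and comments). A hypothetical helper, not part of this commit, could centralize the choice and document the intent in one place:

    import random

    import config  # assumed import path of the base config module

    def get_crawl_interval() -> float:
        """Per-request pause: keep the old sub-second pause when an IP proxy pool is
        enabled, otherwise wait 1..CRAWLER_MAX_SLEEP_SEC seconds to go easier on XHS."""
        if config.ENABLE_IP_PROXY:
            return random.random()
        return random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)

One detail worth noting: the note-detail path pauses with time.sleep(crawl_interval) inside an async coroutine, which blocks the event loop for that duration; await asyncio.sleep(crawl_interval) would be the non-blocking equivalent if concurrent note-detail tasks need to keep running.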

View File

@@ -534,4 +534,6 @@ CREATE TABLE `zhihu_creator` (
-- add column `like_count` to douyin_aweme_comment
alter table douyin_aweme_comment add column `like_count` varchar(255) NOT NULL DEFAULT '0' COMMENT '点赞数';
+
+alter table xhs_note add column xsec_token varchar(50) default null comment '签名算法';
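
Rows written before this migration will have xsec_token = NULL. Since the note_url column already embeds the token as a query parameter (see the store change below), a backfill could recover it from existing rows — a sketch, not part of the commit:

    from typing import Optional
    from urllib.parse import parse_qs, urlparse

    def xsec_token_from_note_url(note_url: str) -> Optional[str]:
        # note_url has the shape
        # https://www.xiaohongshu.com/explore/<note_id>?xsec_token=...&xsec_source=pc_search
        params = parse_qs(urlparse(note_url).query)
        values = params.get("xsec_token")
        return values[0] if values else None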

View File

@@ -107,6 +107,7 @@ async def update_xhs_note(note_item: Dict):
"last_modify_ts": utils.get_current_timestamp(), "last_modify_ts": utils.get_current_timestamp(),
"note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search", "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",
"source_keyword": source_keyword_var.get(), "source_keyword": source_keyword_var.get(),
"xsec_token": note_item.get("xsec_token"),
} }
utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}") utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
await XhsStoreFactory.create_store().store_content(local_db_item) await XhsStoreFactory.create_store().store_content(local_db_item)
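
With xsec_token stored as its own column, the PC-web link can be rebuilt from a saved row without re-crawling. A minimal sketch, assuming the row dict carries the note_id and xsec_token fields written above:

    def build_note_url(row: dict) -> str:
        # Same URL shape as the `note_url` field written by update_xhs_note.
        return (
            f"https://www.xiaohongshu.com/explore/{row['note_id']}"
            f"?xsec_token={row['xsec_token']}&xsec_source=pc_search"
        )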