Mirror of https://github.com/NanmiCoder/MediaCrawler.git, synced 2025-11-26 03:45:00 +08:00
feat: xhs: add a random wait interval in non-proxy mode; store the xsec_token field in db storage mode
@@ -21,10 +21,15 @@ PUBLISH_TIME_TYPE = 0
 CRAWLER_TYPE = (
     "search"  # crawl type: search (keyword search) | detail (note detail) | creator (creator homepage data)
 )

+# Custom User Agent (currently only effective for XHS)
+UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
+
 # Whether to enable IP proxy
 ENABLE_IP_PROXY = False

+# Maximum crawl interval in seconds when proxy is not enabled (currently only effective for XHS)
+CRAWLER_MAX_SLEEP_SEC = 2
+
 # Number of IPs in the proxy pool
 IP_PROXY_POOL_COUNT = 2

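Downstream, these two settings select between two delay distributions, and the same five-line selection is inlined at three call sites in the crawler hunks below. A minimal sketch of that logic as a standalone helper (the name get_crawl_interval is illustrative, not part of the commit):

import random

import config  # the module patched in the hunk above


def get_crawl_interval() -> float:
    # Short sub-second jitter when rotating through proxies; a longer
    # 1..CRAWLER_MAX_SLEEP_SEC wait when hitting XHS from a single IP.
    if config.ENABLE_IP_PROXY:
        return random.random()
    return random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)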
@@ -12,6 +12,7 @@
 import asyncio
 import os
 import random
+import time
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple

@@ -42,7 +43,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://www.xiaohongshu.com"
         # self.user_agent = utils.get_user_agent()
-        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
+        self.user_agent = config.UA if config.UA else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"

     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
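The `config.UA if config.UA else ...` expression falls back to the previous hard-coded Chrome UA whenever UA is empty or None. A sketch of the same pattern, with DEFAULT_UA and resolve_user_agent as illustrative names:

from typing import Optional

DEFAULT_UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
)


def resolve_user_agent(configured_ua: Optional[str]) -> str:
    # Mirrors `config.UA if config.UA else <default>` from the diff:
    # None and the empty string both fall back to the hard-coded UA.
    return configured_ua if configured_ua else DEFAULT_UA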
@@ -195,10 +196,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
         if createor_info:
             await xhs_store.save_creator(user_id, creator=createor_info)

+        # When proxy is not enabled, increase the crawling interval
+        if config.ENABLE_IP_PROXY:
+            crawl_interval = random.random()
+        else:
+            crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
         # Get all note information of the creator
         all_notes_list = await self.xhs_client.get_all_notes_by_creator(
             user_id=user_id,
-            crawl_interval=random.random(),
+            crawl_interval=crawl_interval,
             callback=self.fetch_creator_notes_detail,
         )

@@ -280,6 +286,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
         """
         note_detail_from_html, note_detail_from_api = None, None
         async with semaphore:
+            # When proxy is not enabled, increase the crawling interval
+            if config.ENABLE_IP_PROXY:
+                crawl_interval = random.random()
+            else:
+                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
             try:
                 # Try to fetch the web version of the note detail directly, carrying cookies
                 note_detail_from_html: Optional[Dict] = (
@@ -287,6 +298,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                         note_id, xsec_source, xsec_token, enable_cookie=True
                     )
                 )
+                time.sleep(crawl_interval)
                 if not note_detail_from_html:
                     # If fetching the web version of the note detail failed, retry without cookies
                     note_detail_from_html = (
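Note that this wait runs inside an async function (see the `async with semaphore` above), so the blocking time.sleep call pauses the whole event loop, not just this task. A hedged sketch of a non-blocking alternative (fetch_with_delay is an illustrative name, not part of the commit):

import asyncio
import random


async def fetch_with_delay(sem: asyncio.Semaphore, crawl_interval: float) -> None:
    # Non-blocking variant of the wait added in this hunk: awaiting
    # asyncio.sleep lets the other tasks sharing the semaphore keep
    # running, whereas time.sleep(crawl_interval) stalls the entire
    # event loop for the duration of the wait.
    async with sem:
        await asyncio.sleep(crawl_interval)


asyncio.run(fetch_with_delay(asyncio.Semaphore(5), random.uniform(1, 2)))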
@@ -354,10 +366,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
             utils.logger.info(
                 f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}"
             )
+            # When proxy is not enabled, increase the crawling interval
+            if config.ENABLE_IP_PROXY:
+                crawl_interval = random.random()
+            else:
+                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
             await self.xhs_client.get_note_all_comments(
                 note_id=note_id,
                 xsec_token=xsec_token,
-                crawl_interval=random.random(),
+                crawl_interval=crawl_interval,
                 callback=xhs_store.batch_update_xhs_note_comments,
                 max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
             )

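Since get_note_all_comments applies crawl_interval between successive comment pages, the expected per-page delay at the default settings is about 0.5 s with a proxy and (1 + CRAWLER_MAX_SLEEP_SEC) / 2 = 1.5 s without. A quick empirical check of those averages (illustrative, not part of the commit):

import random

CRAWLER_MAX_SLEEP_SEC = 2  # default from the config hunk above

# Empirical means of the two delay distributions used in this commit.
with_proxy = sum(random.random() for _ in range(100_000)) / 100_000
without_proxy = sum(
    random.uniform(1, CRAWLER_MAX_SLEEP_SEC) for _ in range(100_000)
) / 100_000
print(f"mean delay with proxy:    {with_proxy:.2f}s")    # ~0.50s
print(f"mean delay without proxy: {without_proxy:.2f}s")  # ~1.50s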
@@ -534,4 +534,6 @@ CREATE TABLE `zhihu_creator` (


 -- add column `like_count` to douyin_aweme_comment
 alter table douyin_aweme_comment add column `like_count` varchar(255) NOT NULL DEFAULT '0' COMMENT 'like count';
+
+alter table xhs_note add column xsec_token varchar(50) default null comment 'signature algorithm';
@@ -107,6 +107,7 @@ async def update_xhs_note(note_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",
         "source_keyword": source_keyword_var.get(),
+        "xsec_token": note_item.get("xsec_token"),
     }
     utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
     await XhsStoreFactory.create_store().store_content(local_db_item)
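With xsec_token persisted alongside each note, a valid explore URL can be rebuilt from a stored row later, mirroring the note_url format in the hunk above. A sketch under that assumption (build_note_url and the argument values are illustrative, not part of the commit):

def build_note_url(note_id: str, xsec_token: str, xsec_source: str = "pc_search") -> str:
    # Rebuild the explore URL from stored fields, matching the note_url
    # f-string used in update_xhs_note above.
    return (
        f"https://www.xiaohongshu.com/explore/{note_id}"
        f"?xsec_token={xsec_token}&xsec_source={xsec_source}"
    )


print(build_note_url("example_note_id", "example_xsec_token"))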