mirror of
https://github.com/Usagi-org/ai-goofish-monitor.git
synced 2025-11-25 11:29:41 +08:00
feat: 引入AI调试模式并提升爬取鲁棒性
- 引入 `AI_DEBUG_MODE` 环境变量,可在控制台打印AI分析的详细输入、输出及API响应,便于问题排查。
- 修复浏览器启动问题,通过明确指定 Playwright 的 `channel` 参数确保使用系统安装的 Chrome/Edge 浏览器。
- 优化 `LOGIN_IS_EDGE` 环境变量的解析逻辑,使其更健壮。
- 增加单个商品处理后的随机延迟时间(15-30秒),以降低被反爬的风险,提升爬取稳定性。
- 更新 `config.json` 中部分任务的默认启用状态。
- 更新 `README.md` 文档,说明 `AI_DEBUG_MODE` 配置。
This commit is contained in:
@@ -75,6 +75,9 @@ pip install -r requirements.txt
|
||||
# 使用docker部署不支持GUI,设置 RUN_HEADLESS=true 否则无法运行。
|
||||
RUN_HEADLESS=true
|
||||
|
||||
# (可选) AI调试模式 (true/false)。开启后会在控制台打印更多用于排查AI分析问题的日志。
|
||||
AI_DEBUG_MODE=false
|
||||
|
||||
# 服务端口自定义 不配置默认8000
|
||||
SERVER_PORT=8000
|
||||
```
|
||||
|
||||
7
login.py
7
login.py
@@ -5,7 +5,7 @@ from playwright.async_api import async_playwright
|
||||
|
||||
# 定义保存登录状态的文件路径
|
||||
STATE_FILE = "xianyu_state.json"
|
||||
LOGIN_IS_EDGE = os.getenv("LOGIN_IS_EDGE")
|
||||
LOGIN_IS_EDGE = os.getenv("LOGIN_IS_EDGE", "false").lower() == "true"
|
||||
|
||||
|
||||
|
||||
@@ -16,7 +16,8 @@ async def main():
|
||||
if LOGIN_IS_EDGE:
|
||||
browser = await p.chromium.launch(headless=False, channel="msedge")
|
||||
else:
|
||||
browser = await p.chromium.launch(headless=False)
|
||||
# 明确指定使用系统安装的 Chrome 浏览器,以绕过 Playwright 的内部浏览器管理问题
|
||||
browser = await p.chromium.launch(headless=False, channel="chrome")
|
||||
context = await browser.new_context()
|
||||
page = await context.new_page()
|
||||
|
||||
@@ -46,4 +47,4 @@ async def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("正在启动浏览器以进行登录...")
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
|
||||
35
spider_v2.py
35
spider_v2.py
@@ -35,6 +35,8 @@ NTFY_TOPIC_URL = os.getenv("NTFY_TOPIC_URL")
|
||||
WX_BOT_URL = os.getenv("WX_BOT_URL")
|
||||
PCURL_TO_MOBILE = os.getenv("PCURL_TO_MOBILE")
|
||||
RUN_HEADLESS = os.getenv("RUN_HEADLESS", "true").lower() != "false"
|
||||
LOGIN_IS_EDGE = os.getenv("LOGIN_IS_EDGE", "false").lower() == "true"
|
||||
AI_DEBUG_MODE = os.getenv("AI_DEBUG_MODE", "false").lower() == "true"
|
||||
|
||||
# 检查配置是否齐全
|
||||
if not all([BASE_URL, MODEL_NAME]):
|
||||
@@ -317,6 +319,10 @@ async def _parse_search_results_json(json_data: dict, source: str) -> list:
|
||||
items = await safe_get(json_data, "data", "resultList", default=[])
|
||||
if not items:
|
||||
print(f"LOG: ({source}) API响应中未找到商品列表 (resultList)。")
|
||||
if AI_DEBUG_MODE:
|
||||
print(f"--- [SEARCH DEBUG] RAW JSON RESPONSE from {source} ---")
|
||||
print(json.dumps(json_data, ensure_ascii=False, indent=2))
|
||||
print("----------------------------------------------------")
|
||||
return []
|
||||
|
||||
for item in items:
|
||||
@@ -588,6 +594,14 @@ async def get_ai_analysis(product_data, image_paths=None, prompt_text=""):
|
||||
product_details_json = json.dumps(product_data, ensure_ascii=False, indent=2)
|
||||
system_prompt = prompt_text
|
||||
|
||||
if AI_DEBUG_MODE:
|
||||
print("\n--- [AI DEBUG] ---")
|
||||
print("--- PROMPT TEXT (first 500 chars) ---")
|
||||
print(prompt_text[:500] + "...")
|
||||
print("--- PRODUCT DATA (JSON) ---")
|
||||
print(product_details_json)
|
||||
print("-------------------\n")
|
||||
|
||||
combined_text_prompt = f"""{system_prompt}
|
||||
|
||||
请基于你的专业知识和我的要求,分析以下完整的商品JSON数据:
|
||||
@@ -614,6 +628,12 @@ async def get_ai_analysis(product_data, image_paths=None, prompt_text=""):
|
||||
|
||||
ai_response_content = response.choices[0].message.content
|
||||
|
||||
if AI_DEBUG_MODE:
|
||||
print("\n--- [AI DEBUG] ---")
|
||||
print("--- RAW AI RESPONSE ---")
|
||||
print(ai_response_content)
|
||||
print("---------------------\n")
|
||||
|
||||
try:
|
||||
# --- 新增代码:从Markdown代码块中提取JSON ---
|
||||
# 寻找第一个 "{" 和最后一个 "}" 来捕获完整的JSON对象
|
||||
@@ -670,7 +690,11 @@ async def scrape_xianyu(task_config: dict, debug_limit: int = 0):
|
||||
print(f"LOG: 输出文件 {output_filename} 不存在,将创建新文件。")
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=RUN_HEADLESS)
|
||||
if LOGIN_IS_EDGE:
|
||||
browser = await p.chromium.launch(headless=RUN_HEADLESS, channel="msedge")
|
||||
else:
|
||||
# 明确指定使用系统安装的 Chrome 浏览器,以绕过 Playwright 的内部浏览器管理问题
|
||||
browser = await p.chromium.launch(headless=RUN_HEADLESS, channel="chrome")
|
||||
context = await browser.new_context(storage_state=STATE_FILE, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
|
||||
page = await context.new_page()
|
||||
|
||||
@@ -906,6 +930,15 @@ async def scrape_xianyu(task_config: dict, debug_limit: int = 0):
|
||||
# --- 修改: 增加单个商品处理后的主要延迟 ---
|
||||
print(" [反爬] 执行一次主要的随机延迟以模拟用户浏览间隔...")
|
||||
await random_sleep(15, 30) # 原来是 (8, 15),这是最重要的修改之一
|
||||
else:
|
||||
print(f" 错误: 获取商品详情API响应失败,状态码: {detail_response.status}")
|
||||
if AI_DEBUG_MODE:
|
||||
print(f"--- [DETAIL DEBUG] FAILED RESPONSE from {item_data['商品链接']} ---")
|
||||
try:
|
||||
print(await detail_response.text())
|
||||
except Exception as e:
|
||||
print(f"无法读取响应内容: {e}")
|
||||
print("----------------------------------------------------")
|
||||
|
||||
except PlaywrightTimeoutError:
|
||||
print(f" 错误: 访问商品详情页或等待API响应超时。")
|
||||
|
||||
Reference in New Issue
Block a user