feat: support detail mode for Zhihu

Relakkes
2024-12-26 17:36:33 +08:00
parent dc9116e098
commit ea5223c708
6 changed files with 239 additions and 17 deletions

README.md

@@ -32,7 +32,7 @@
| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Weibo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Tieba | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
- | Zhihu | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ |
+ | Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
### MediaCrawlerPro is officially released!
> It focuses on learning the architecture design of a mature project: beyond the crawler itself, the other code design ideas in Pro are also worth studying. Welcome to follow the project!
@@ -111,7 +111,9 @@
> [MediaCrawler online documentation](https://nanmicoder.github.io/MediaCrawler/)
>
- # Paid knowledge services
+ # Knowledge services offered by the author
+ > If you want to quickly get started with this project, learn how to use it, study its source-code architecture, pick up programming skills, or understand the source design of MediaCrawlerPro, take a look at my paid knowledge column.
[Introduction to the author's paid knowledge column](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
# Project WeChat discussion group

config/base_config.py

@@ -162,6 +162,13 @@ ZHIHU_CREATOR_URL_LIST = [
# ........................
]
+ # List of Zhihu post URLs to crawl in detail mode
+ ZHIHU_SPECIFIED_ID_LIST = [
+     "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
+     "https://zhuanlan.zhihu.com/p/673461588",  # article
+     "https://www.zhihu.com/zvideo/1539542068422144000"  # video
+ ]
# Word cloud settings
# Whether to generate a word cloud image from comments
ENABLE_GET_WORDCLOUD = False
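For orientation, here is a minimal sketch (not part of the commit) of how detail mode consumes these entries: query parameters are stripped before the URL is dispatched, so share links with tracking suffixes also work.

```python
# Hedged sketch: normalization applied to each ZHIHU_SPECIFIED_ID_LIST entry.
urls = [
    "https://www.zhihu.com/question/826896610/answer/4885821440?utm_source=copy",
    "https://zhuanlan.zhihu.com/p/673461588",
]
for url in urls:
    url = url.split("?")[0]  # the crawler drops query params before classifying the URL
    print(url)
# https://www.zhihu.com/question/826896610/answer/4885821440
# https://zhuanlan.zhihu.com/p/673461588
```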

constant/zhihu.py

@@ -11,7 +11,9 @@
# -*- coding: utf-8 -*-
ZHIHU_URL = "https://www.zhihu.com"
+ ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"
ANSWER_NAME = "answer"
ARTICLE_NAME = "article"
VIDEO_NAME = "zvideo"

media_platform/zhihu/client.py

@@ -121,7 +121,12 @@ class ZhiHuClient(AbstractApiClient):
if isinstance(params, dict):
    final_uri += '?' + urlencode(params)
headers = await self._pre_headers(final_uri)
- return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
+ base_url = (
+     zhihu_constant.ZHIHU_URL
+     if "/p/" not in uri
+     else zhihu_constant.ZHIHU_ZHUANLAN_URL
+ )
+ return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)
async def pong(self) -> bool:
    """
@@ -209,7 +214,7 @@ class ZhiHuClient(AbstractApiClient):
return self._extractor.extract_contents_from_search(search_res)

async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
-                              order_by: str = "sort") -> Dict:
+                              order_by: str = "score") -> Dict:
""" """
获取内容的一级评论 获取内容的一级评论
Args: Args:
@@ -222,13 +227,16 @@ class ZhiHuClient(AbstractApiClient):
Returns:
"""
uri = f"/api/v4/{content_type}s/{content_id}/root_comments" uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
params = { params = {"order": order_by, "offset": offset, "limit": limit}
"order": order_by,
"offset": offset,
"limit": limit
}
return await self.get(uri, params)
# uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
# params = {
# "order": order_by,
# "offset": offset,
# "limit": limit
# }
# return await self.get(uri, params)
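To make the endpoint switch concrete, a sketch (illustrative only, using one answer ID from the config above) of the URL the new code builds:

```python
# Illustrative only: reconstruct the URI get_root_comments() now requests.
from urllib.parse import urlencode

content_type, content_id = "answer", "4885821440"
uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
params = {"order": "score", "offset": "", "limit": 10}
print("https://www.zhihu.com" + uri + "?" + urlencode(params))
# https://www.zhihu.com/api/v4/comment_v5/answers/4885821440/root_comment?order=score&offset=&limit=10
```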
async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
                             order_by: str = "sort") -> Dict:
@@ -496,3 +504,46 @@ class ZhiHuClient(AbstractApiClient):
offset += limit
await asyncio.sleep(crawl_interval)
return all_contents
+ async def get_answer_info(
+         self, question_id: str, answer_id: str
+ ) -> Optional[ZhihuContent]:
+     """
+     Get answer info
+     Args:
+         question_id:
+         answer_id:
+     Returns:
+     """
+     uri = f"/question/{question_id}/answer/{answer_id}"
+     response_html = await self.get(uri, return_response=True)
+     return self._extractor.extract_answer_content_from_html(response_html)
+
+ async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
+     """
+     Get article info
+     Args:
+         article_id:
+     Returns:
+     """
+     uri = f"/p/{article_id}"
+     response_html = await self.get(uri, return_response=True)
+     return self._extractor.extract_article_content_from_html(response_html)
+
+ async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
+     """
+     Get video info
+     Args:
+         video_id:
+     Returns:
+     """
+     uri = f"/zvideo/{video_id}"
+     response_html = await self.get(uri, return_response=True)
+     return self._extractor.extract_zvideo_content_from_html(response_html)
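A hedged usage sketch of the three new fetchers (`client` is an assumed, already logged-in ZhiHuClient instance; not part of the commit):

```python
# Illustrative usage of the new detail fetchers.
async def fetch_examples(client) -> None:
    answer = await client.get_answer_info("826896610", "4885821440")
    article = await client.get_article_info("673461588")
    video = await client.get_video_info("1539542068422144000")
    for item in (answer, article, video):
        if item:  # each extractor returns None when js-initialData is absent
            print(item.content_id, item.title)
```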

media_platform/zhihu/core.py

@@ -14,12 +14,13 @@ import asyncio
import os
import random
from asyncio import Task
- from typing import Dict, List, Optional, Tuple
+ from typing import Dict, List, Optional, Tuple, cast
from playwright.async_api import (BrowserContext, BrowserType, Page,
                                  async_playwright)

import config
+ from constant import zhihu as constant
from base.base_crawler import AbstractCrawler
from model.m_zhihu import ZhihuContent, ZhihuCreator
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
@@ -29,7 +30,7 @@ from var import crawler_type_var, source_keyword_var
from .client import ZhiHuClient
from .exception import DataFetchError
- from .help import ZhihuExtractor
+ from .help import ZhihuExtractor, judge_zhihu_url
from .login import ZhiHuLogin
@@ -96,7 +97,7 @@ class ZhihuCrawler(AbstractCrawler):
    await self.search()
elif config.CRAWLER_TYPE == "detail":
    # Get the information and comments of the specified post
-     raise NotImplementedError
+     await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator": elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments # Get creator's information and their notes and comments
await self.get_creators_and_notes() await self.get_creators_and_notes()
@@ -226,6 +227,76 @@ class ZhihuCrawler(AbstractCrawler):
# Get all comments of the creator's contents
await self.batch_get_content_comments(all_content_list)
+ async def get_note_detail(
+         self, full_note_url: str, semaphore: asyncio.Semaphore
+ ) -> Optional[ZhihuContent]:
+     """
+     Get note detail
+     Args:
+         full_note_url: full URL of the answer/article/zvideo
+         semaphore: shared concurrency limiter
+     Returns:
+     """
+     async with semaphore:
+         utils.logger.info(
+             f"[ZhihuCrawler.get_note_detail] Begin get specified note {full_note_url}"
+         )
+         # judge note type
+         note_type: str = judge_zhihu_url(full_note_url)
+         if note_type == constant.ANSWER_NAME:
+             question_id = full_note_url.split("/")[-3]
+             answer_id = full_note_url.split("/")[-1]
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_note_detail] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
+             )
+             return await self.zhihu_client.get_answer_info(question_id, answer_id)
+
+         elif note_type == constant.ARTICLE_NAME:
+             article_id = full_note_url.split("/")[-1]
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_note_detail] Get article info, article_id: {article_id}"
+             )
+             return await self.zhihu_client.get_article_info(article_id)
+
+         elif note_type == constant.VIDEO_NAME:
+             video_id = full_note_url.split("/")[-1]
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_note_detail] Get video info, video_id: {video_id}"
+             )
+             return await self.zhihu_client.get_video_info(video_id)
+
+         return None  # unknown URL type
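The positional ID extraction relies on the fixed path shape of answer URLs; a quick spot-check (not in the commit):

```python
# Spot-check of the split("/") indexing used for answer URLs.
url = "https://www.zhihu.com/question/826896610/answer/4885821440"
assert url.split("/")[-3] == "826896610"   # question_id
assert url.split("/")[-1] == "4885821440"  # answer_id
```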
+ async def get_specified_notes(self):
+     """
+     Get the information and comments of the specified posts
+     Returns:
+     """
+     # share one semaphore across all tasks so MAX_CONCURRENCY_NUM actually caps concurrency
+     semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+     get_note_detail_task_list = []
+     for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
+         # remove query params
+         full_note_url = full_note_url.split("?")[0]
+         crawler_task = self.get_note_detail(
+             full_note_url=full_note_url,
+             semaphore=semaphore,
+         )
+         get_note_detail_task_list.append(crawler_task)
+
+     need_get_comment_notes: List[ZhihuContent] = []
+     note_details = await asyncio.gather(*get_note_detail_task_list)
+     for index, note_detail in enumerate(note_details):
+         if not note_detail:
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
+             )
+             continue
+
+         note_detail = cast(ZhihuContent, note_detail)  # only for type check
+         need_get_comment_notes.append(note_detail)
+         await zhihu_store.update_zhihu_content(note_detail)
+
+     await self.batch_get_content_comments(need_get_comment_notes)
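Why the shared semaphore matters: every coroutine must acquire the same instance for MAX_CONCURRENCY_NUM to cap concurrency; per-task instances would each start at full capacity and cap nothing. A minimal demonstration (not part of the commit):

```python
# Sketch: a single shared Semaphore limits concurrent fetches.
import asyncio

async def fetch(i: int, sem: asyncio.Semaphore) -> int:
    async with sem:               # at most 2 coroutines run this block at once
        await asyncio.sleep(0.1)  # stand-in for a network request
        return i

async def main() -> None:
    sem = asyncio.Semaphore(2)
    print(await asyncio.gather(*(fetch(i, sem) for i in range(5))))

asyncio.run(main())
```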
@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:

media_platform/zhihu/help.py

@@ -159,15 +159,13 @@ class ZhihuExtractor:
res = ZhihuContent()
+ res.content_id = zvideo.get("id")  # must be set before content_url is built from it below
if "video" in zvideo and isinstance(zvideo.get("video"), dict):  # came from the creator homepage video-list API
res.content_id = zvideo.get("video").get("video_id")
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}" res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
res.created_time = zvideo.get("published_at") res.created_time = zvideo.get("published_at")
res.updated_time = zvideo.get("updated_at") res.updated_time = zvideo.get("updated_at")
else: else:
res.content_id = zvideo.get("zvideo_id")
res.content_url = zvideo.get("video_url") res.content_url = zvideo.get("video_url")
res.created_time = zvideo.get("created_at") res.created_time = zvideo.get("created_at")
res.content_id = zvideo.get("id")
res.content_type = zvideo.get("type")
res.title = extract_text_from_html(zvideo.get("title"))
res.desc = extract_text_from_html(zvideo.get("description"))
@@ -369,3 +367,94 @@ class ZhihuExtractor:
    return []
return self._extract_content_list(anwser_list)
+ def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+     """
+     extract zhihu answer content from html
+     Args:
+         html_content:
+     Returns:
+     """
+     js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+     if not js_init_data:
+         return None
+     json_data: Dict = json.loads(js_init_data)
+     answer_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("answers", {})
+     if not answer_info:
+         return None
+     return self._extract_answer_content(answer_info.get(list(answer_info.keys())[0]))
+
+ def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+     """
+     extract zhihu article content from html
+     Args:
+         html_content:
+     Returns:
+     """
+     js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+     if not js_init_data:
+         return None
+     json_data: Dict = json.loads(js_init_data)
+     article_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("articles", {})
+     if not article_info:
+         return None
+     return self._extract_article_content(article_info.get(list(article_info.keys())[0]))
+
+ def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+     """
+     extract zhihu zvideo content from html
+     Args:
+         html_content:
+     Returns:
+     """
+     js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+     if not js_init_data:
+         return None
+     json_data: Dict = json.loads(js_init_data)
+     zvideo_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("zvideos", {})
+     users: Dict = json_data.get("initialState", {}).get("entities", {}).get("users", {})
+     if not zvideo_info:
+         return None
+
+     # resolve the author: the zvideo entity may reference the user by key only
+     video_detail_info: Dict = zvideo_info.get(list(zvideo_info.keys())[0])
+     if not video_detail_info:
+         return None
+     if isinstance(video_detail_info.get("author"), str):
+         author_name: str = video_detail_info.get("author")
+         video_detail_info["author"] = users.get(author_name)
+     return self._extract_zvideo_content(video_detail_info)
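All three extractors share one pattern: Zhihu server-renders its state into a `<script id="js-initialData">` JSON blob, and the requested entity sits under the matching collection; for zvideos, the author field may be a bare key into entities.users. A toy, self-contained illustration with made-up data (parsel assumed, as in the surrounding code):

```python
# Toy illustration of the js-initialData pattern, including author resolution.
import json
from parsel import Selector

html = (
    '<script id="js-initialData">'
    '{"initialState": {"entities": {'
    '"zvideos": {"1539542068422144000": {"id": 1539542068422144000, "author": "tok"}},'
    '"users": {"tok": {"name": "example-author"}}}}}'
    "</script>"
)
raw = Selector(text=html).xpath("//script[@id='js-initialData']/text()").get(default="")
entities = json.loads(raw)["initialState"]["entities"]
video = entities["zvideos"][next(iter(entities["zvideos"]))]
if isinstance(video.get("author"), str):  # bare key -> inline the full user dict
    video["author"] = entities["users"].get(video["author"])
print(video["author"]["name"])  # example-author
```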
+ def judge_zhihu_url(note_detail_url: str) -> str:
+     """
+     judge zhihu url type
+     Args:
+         note_detail_url:
+             eg1: https://www.zhihu.com/question/123456789/answer/123456789  # answer
+             eg2: https://zhuanlan.zhihu.com/p/123456789  # article
+             eg3: https://www.zhihu.com/zvideo/123456789  # zvideo
+     Returns:
+     """
+     if "/answer/" in note_detail_url:
+         return zhihu_constant.ANSWER_NAME
+     elif "/p/" in note_detail_url:
+         return zhihu_constant.ARTICLE_NAME
+     elif "/zvideo/" in note_detail_url:
+         return zhihu_constant.VIDEO_NAME
+     else:
+         return ""