feat: support detail mode for Zhihu

Relakkes
2024-12-26 17:36:33 +08:00
parent dc9116e098
commit ea5223c708
6 changed files with 239 additions and 17 deletions

README.md

@@ -32,7 +32,7 @@
| Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Weibo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Tieba | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
- | Zhihu | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ |
+ | Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
### MediaCrawlerPro is officially released!
> It focuses on learning the architecture design of a mature project: beyond the crawler itself, the other code design ideas in Pro are also worth studying. Welcome to follow the project!
@@ -111,7 +111,9 @@
> [MediaCrawler online documentation](https://nanmicoder.github.io/MediaCrawler/)
>
- # Paid knowledge services
+ # Knowledge services offered by the author
+ > If you want to quickly get started with this project, learn how to use it, study its source-code architecture, pick up programming skills, or understand the source design of MediaCrawlerPro, take a look at my paid knowledge column.
[Introduction to the author's paid knowledge column](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
# Project WeChat discussion group

config/base_config.py

@@ -162,6 +162,13 @@ ZHIHU_CREATOR_URL_LIST = [
# ........................
]
+ # List of Zhihu post URLs to crawl in detail mode
+ ZHIHU_SPECIFIED_ID_LIST = [
+     "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
+     "https://zhuanlan.zhihu.com/p/673461588",  # article
+     "https://www.zhihu.com/zvideo/1539542068422144000"  # video
+ ]
# Word cloud settings
# Whether to generate a word cloud image from comments
ENABLE_GET_WORDCLOUD = False
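For orientation, here is a minimal sketch (not part of the commit) of how detail mode consumes these entries: query parameters are stripped before the URL is dispatched, so share links with tracking suffixes also work.

```python
# Hedged sketch: normalization applied to each ZHIHU_SPECIFIED_ID_LIST entry.
urls = [
    "https://www.zhihu.com/question/826896610/answer/4885821440?utm_source=copy",
    "https://zhuanlan.zhihu.com/p/673461588",
]
for url in urls:
    url = url.split("?")[0]  # the crawler drops query params before classifying the URL
    print(url)
# https://www.zhihu.com/question/826896610/answer/4885821440
# https://zhuanlan.zhihu.com/p/673461588
```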

constant/zhihu.py

@@ -11,7 +11,9 @@
# -*- coding: utf-8 -*-
ZHIHU_URL = "https://www.zhihu.com"
+ ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"
ANSWER_NAME = "answer"
ARTICLE_NAME = "article"
VIDEO_NAME = "zvideo"

media_platform/zhihu/client.py

@@ -121,7 +121,12 @@ class ZhiHuClient(AbstractApiClient):
if isinstance(params, dict):
    final_uri += '?' + urlencode(params)
headers = await self._pre_headers(final_uri)
- return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
+ base_url = (
+     zhihu_constant.ZHIHU_URL
+     if "/p/" not in uri
+     else zhihu_constant.ZHIHU_ZHUANLAN_URL
+ )
+ return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)
async def pong(self) -> bool:
    """
@@ -209,7 +214,7 @@ class ZhiHuClient(AbstractApiClient):
return self._extractor.extract_contents_from_search(search_res)

async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
-                              order_by: str = "sort") -> Dict:
+                              order_by: str = "score") -> Dict:
""" """
获取内容的一级评论 获取内容的一级评论
Args: Args:
@@ -222,13 +227,16 @@ class ZhiHuClient(AbstractApiClient):
Returns:
"""
uri = f"/api/v4/{content_type}s/{content_id}/root_comments" uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
params = { params = {"order": order_by, "offset": offset, "limit": limit}
"order": order_by,
"offset": offset,
"limit": limit
}
return await self.get(uri, params)
# uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
# params = {
# "order": order_by,
# "offset": offset,
# "limit": limit
# }
# return await self.get(uri, params)
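To make the endpoint switch concrete, a sketch (illustrative only, using one answer ID from the config above) of the URL the new code builds:

```python
# Illustrative only: reconstruct the URI get_root_comments() now requests.
from urllib.parse import urlencode

content_type, content_id = "answer", "4885821440"
uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
params = {"order": "score", "offset": "", "limit": 10}
print("https://www.zhihu.com" + uri + "?" + urlencode(params))
# https://www.zhihu.com/api/v4/comment_v5/answers/4885821440/root_comment?order=score&offset=&limit=10
```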
async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
                             order_by: str = "sort") -> Dict:
@@ -496,3 +504,46 @@ class ZhiHuClient(AbstractApiClient):
offset += limit
await asyncio.sleep(crawl_interval)
return all_contents
+ async def get_answer_info(
+         self, question_id: str, answer_id: str
+ ) -> Optional[ZhihuContent]:
+     """
+     Get answer info
+     Args:
+         question_id:
+         answer_id:
+     Returns:
+     """
+     uri = f"/question/{question_id}/answer/{answer_id}"
+     response_html = await self.get(uri, return_response=True)
+     return self._extractor.extract_answer_content_from_html(response_html)
+
+ async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
+     """
+     Get article info
+     Args:
+         article_id:
+     Returns:
+     """
+     uri = f"/p/{article_id}"
+     response_html = await self.get(uri, return_response=True)
+     return self._extractor.extract_article_content_from_html(response_html)
+
+ async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
+     """
+     Get video info
+     Args:
+         video_id:
+     Returns:
+     """
+     uri = f"/zvideo/{video_id}"
+     response_html = await self.get(uri, return_response=True)
+     return self._extractor.extract_zvideo_content_from_html(response_html)
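A hedged usage sketch of the three new fetchers (`client` is an assumed, already logged-in ZhiHuClient instance; not part of the commit):

```python
# Illustrative usage of the new detail fetchers.
async def fetch_examples(client) -> None:
    answer = await client.get_answer_info("826896610", "4885821440")
    article = await client.get_article_info("673461588")
    video = await client.get_video_info("1539542068422144000")
    for item in (answer, article, video):
        if item:  # each extractor returns None when js-initialData is absent
            print(item.content_id, item.title)
```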

media_platform/zhihu/core.py

@@ -14,12 +14,13 @@ import asyncio
import os
import random
from asyncio import Task
- from typing import Dict, List, Optional, Tuple
+ from typing import Dict, List, Optional, Tuple, cast
from playwright.async_api import (BrowserContext, BrowserType, Page,
                                  async_playwright)

import config
+ from constant import zhihu as constant
from base.base_crawler import AbstractCrawler
from model.m_zhihu import ZhihuContent, ZhihuCreator
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
@@ -29,7 +30,7 @@ from var import crawler_type_var, source_keyword_var
from .client import ZhiHuClient
from .exception import DataFetchError
- from .help import ZhihuExtractor
+ from .help import ZhihuExtractor, judge_zhihu_url
from .login import ZhiHuLogin
@@ -96,7 +97,7 @@ class ZhihuCrawler(AbstractCrawler):
    await self.search()
elif config.CRAWLER_TYPE == "detail":
    # Get the information and comments of the specified post
-     raise NotImplementedError
+     await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator": elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments # Get creator's information and their notes and comments
await self.get_creators_and_notes() await self.get_creators_and_notes()
@@ -226,6 +227,76 @@ class ZhihuCrawler(AbstractCrawler):
# Get all comments of the creator's contents
await self.batch_get_content_comments(all_content_list)
+ async def get_note_detail(
+         self, full_note_url: str, semaphore: asyncio.Semaphore
+ ) -> Optional[ZhihuContent]:
+     """
+     Get note detail
+     Args:
+         full_note_url: full URL of the answer/article/zvideo
+         semaphore: shared concurrency limiter
+     Returns:
+     """
+     async with semaphore:
+         utils.logger.info(
+             f"[ZhihuCrawler.get_note_detail] Begin get specified note {full_note_url}"
+         )
+         # judge note type
+         note_type: str = judge_zhihu_url(full_note_url)
+         if note_type == constant.ANSWER_NAME:
+             question_id = full_note_url.split("/")[-3]
+             answer_id = full_note_url.split("/")[-1]
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_note_detail] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
+             )
+             return await self.zhihu_client.get_answer_info(question_id, answer_id)
+
+         elif note_type == constant.ARTICLE_NAME:
+             article_id = full_note_url.split("/")[-1]
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_note_detail] Get article info, article_id: {article_id}"
+             )
+             return await self.zhihu_client.get_article_info(article_id)
+
+         elif note_type == constant.VIDEO_NAME:
+             video_id = full_note_url.split("/")[-1]
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_note_detail] Get video info, video_id: {video_id}"
+             )
+             return await self.zhihu_client.get_video_info(video_id)
+
+         return None  # unknown URL type
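The positional ID extraction relies on the fixed path shape of answer URLs; a quick spot-check (not in the commit):

```python
# Spot-check of the split("/") indexing used for answer URLs.
url = "https://www.zhihu.com/question/826896610/answer/4885821440"
assert url.split("/")[-3] == "826896610"   # question_id
assert url.split("/")[-1] == "4885821440"  # answer_id
```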
+ async def get_specified_notes(self):
+     """
+     Get the information and comments of the specified posts
+     Returns:
+     """
+     # share one semaphore across all tasks so MAX_CONCURRENCY_NUM actually caps concurrency
+     semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+     get_note_detail_task_list = []
+     for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
+         # remove query params
+         full_note_url = full_note_url.split("?")[0]
+         crawler_task = self.get_note_detail(
+             full_note_url=full_note_url,
+             semaphore=semaphore,
+         )
+         get_note_detail_task_list.append(crawler_task)
+
+     need_get_comment_notes: List[ZhihuContent] = []
+     note_details = await asyncio.gather(*get_note_detail_task_list)
+     for index, note_detail in enumerate(note_details):
+         if not note_detail:
+             utils.logger.info(
+                 f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
+             )
+             continue
+
+         note_detail = cast(ZhihuContent, note_detail)  # only for type check
+         need_get_comment_notes.append(note_detail)
+         await zhihu_store.update_zhihu_content(note_detail)
+
+     await self.batch_get_content_comments(need_get_comment_notes)
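Why the shared semaphore matters: every coroutine must acquire the same instance for MAX_CONCURRENCY_NUM to cap concurrency; per-task instances would each start at full capacity and cap nothing. A minimal demonstration (not part of the commit):

```python
# Sketch: a single shared Semaphore limits concurrent fetches.
import asyncio

async def fetch(i: int, sem: asyncio.Semaphore) -> int:
    async with sem:               # at most 2 coroutines run this block at once
        await asyncio.sleep(0.1)  # stand-in for a network request
        return i

async def main() -> None:
    sem = asyncio.Semaphore(2)
    print(await asyncio.gather(*(fetch(i, sem) for i in range(5))))

asyncio.run(main())
```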
@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:

media_platform/zhihu/help.py

@@ -159,15 +159,13 @@ class ZhihuExtractor:
res = ZhihuContent()
+ res.content_id = zvideo.get("id")  # must be set before content_url is built from it below
if "video" in zvideo and isinstance(zvideo.get("video"), dict):  # came from the creator homepage video-list API
res.content_id = zvideo.get("video").get("video_id")
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}" res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
res.created_time = zvideo.get("published_at") res.created_time = zvideo.get("published_at")
res.updated_time = zvideo.get("updated_at") res.updated_time = zvideo.get("updated_at")
else: else:
res.content_id = zvideo.get("zvideo_id")
res.content_url = zvideo.get("video_url") res.content_url = zvideo.get("video_url")
res.created_time = zvideo.get("created_at") res.created_time = zvideo.get("created_at")
res.content_id = zvideo.get("id")
res.content_type = zvideo.get("type")
res.title = extract_text_from_html(zvideo.get("title"))
res.desc = extract_text_from_html(zvideo.get("description"))
@@ -369,3 +367,94 @@ class ZhihuExtractor:
    return []
return self._extract_content_list(anwser_list)
+ def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+     """
+     extract zhihu answer content from html
+     Args:
+         html_content:
+     Returns:
+     """
+     js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+     if not js_init_data:
+         return None
+     json_data: Dict = json.loads(js_init_data)
+     answer_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("answers", {})
+     if not answer_info:
+         return None
+     return self._extract_answer_content(answer_info.get(list(answer_info.keys())[0]))
+
+ def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+     """
+     extract zhihu article content from html
+     Args:
+         html_content:
+     Returns:
+     """
+     js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+     if not js_init_data:
+         return None
+     json_data: Dict = json.loads(js_init_data)
+     article_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("articles", {})
+     if not article_info:
+         return None
+     return self._extract_article_content(article_info.get(list(article_info.keys())[0]))
+
+ def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+     """
+     extract zhihu zvideo content from html
+     Args:
+         html_content:
+     Returns:
+     """
+     js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+     if not js_init_data:
+         return None
+     json_data: Dict = json.loads(js_init_data)
+     zvideo_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("zvideos", {})
+     users: Dict = json_data.get("initialState", {}).get("entities", {}).get("users", {})
+     if not zvideo_info:
+         return None
+
+     # resolve the author: the zvideo entity may reference the user by key only
+     video_detail_info: Dict = zvideo_info.get(list(zvideo_info.keys())[0])
+     if not video_detail_info:
+         return None
+     if isinstance(video_detail_info.get("author"), str):
+         author_name: str = video_detail_info.get("author")
+         video_detail_info["author"] = users.get(author_name)
+     return self._extract_zvideo_content(video_detail_info)
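All three extractors share one pattern: Zhihu server-renders its state into a `<script id="js-initialData">` JSON blob, and the requested entity sits under the matching collection; for zvideos, the author field may be a bare key into entities.users. A toy, self-contained illustration with made-up data (parsel assumed, as in the surrounding code):

```python
# Toy illustration of the js-initialData pattern, including author resolution.
import json
from parsel import Selector

html = (
    '<script id="js-initialData">'
    '{"initialState": {"entities": {'
    '"zvideos": {"1539542068422144000": {"id": 1539542068422144000, "author": "tok"}},'
    '"users": {"tok": {"name": "example-author"}}}}}'
    "</script>"
)
raw = Selector(text=html).xpath("//script[@id='js-initialData']/text()").get(default="")
entities = json.loads(raw)["initialState"]["entities"]
video = entities["zvideos"][next(iter(entities["zvideos"]))]
if isinstance(video.get("author"), str):  # bare key -> inline the full user dict
    video["author"] = entities["users"].get(video["author"])
print(video["author"]["name"])  # example-author
```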
+ def judge_zhihu_url(note_detail_url: str) -> str:
+     """
+     judge zhihu url type
+     Args:
+         note_detail_url:
+             eg1: https://www.zhihu.com/question/123456789/answer/123456789  # answer
+             eg2: https://zhuanlan.zhihu.com/p/123456789  # article
+             eg3: https://www.zhihu.com/zvideo/123456789  # zvideo
+     Returns:
+     """
+     if "/answer/" in note_detail_url:
+         return zhihu_constant.ANSWER_NAME
+     elif "/p/" in note_detail_url:
+         return zhihu_constant.ARTICLE_NAME
+     elif "/zvideo/" in note_detail_url:
+         return zhihu_constant.VIDEO_NAME
+     else:
+         return ""