From ea5223c708508a3b3845feec0a2ecf03481710f5 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Thu, 26 Dec 2024 17:36:33 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E7=9F=A5=E4=B9=8E=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E8=AF=A6=E6=83=85=E6=A8=A1=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 6 ++- config/base_config.py | 7 +++ constant/zhihu.py | 4 +- media_platform/zhihu/client.py | 67 +++++++++++++++++++++--- media_platform/zhihu/core.py | 77 +++++++++++++++++++++++++-- media_platform/zhihu/help.py | 95 ++++++++++++++++++++++++++++++++-- 6 files changed, 239 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 8ed52f9..f8b14a3 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 微博 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 贴吧 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 知乎 | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 知乎 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ### MediaCrawlerPro重磅发布啦!!! > 主打学习成熟项目的架构设计,不仅仅是爬虫,Pro中的其他代码设计思路也是值得学习,欢迎大家关注!!! @@ -111,7 +111,9 @@ > [MediaCrawler在线文档](https://nanmicoder.github.io/MediaCrawler/) > -# 知识付费服务 +# 作者提供的知识服务 +> 如果想快速入门和学习该项目的使用、源码架构设计等、学习编程技术、亦或者想了解MediaCrawlerPro的源代码设计可以看下我的知识付费栏目。 + [作者的知识付费栏目介绍](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html) # 项目微信交流群 diff --git a/config/base_config.py b/config/base_config.py index d5a2b0c..78a3bb5 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -162,6 +162,13 @@ ZHIHU_CREATOR_URL_LIST = [ # ........................ ] +# 指定知乎需要爬取的帖子ID列表 +ZHIHU_SPECIFIED_ID_LIST = [ + "https://www.zhihu.com/question/826896610/answer/4885821440", # 回答 + "https://zhuanlan.zhihu.com/p/673461588", # 文章 + "https://www.zhihu.com/zvideo/1539542068422144000" # 视频 +] + # 词云相关 # 是否开启生成评论词云图 ENABLE_GET_WORDCLOUD = False diff --git a/constant/zhihu.py b/constant/zhihu.py index 495ba6a..2a52667 100644 --- a/constant/zhihu.py +++ b/constant/zhihu.py @@ -11,7 +11,9 @@ # -*- coding: utf-8 -*- ZHIHU_URL = "https://www.zhihu.com" +ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com" ANSWER_NAME = "answer" ARTICLE_NAME = "article" -VIDEO_NAME = "zvideo" \ No newline at end of file +VIDEO_NAME = "zvideo" + diff --git a/media_platform/zhihu/client.py b/media_platform/zhihu/client.py index 1e11d7c..5991163 100644 --- a/media_platform/zhihu/client.py +++ b/media_platform/zhihu/client.py @@ -121,7 +121,12 @@ class ZhiHuClient(AbstractApiClient): if isinstance(params, dict): final_uri += '?' + urlencode(params) headers = await self._pre_headers(final_uri) - return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs) + base_url = ( + zhihu_constant.ZHIHU_URL + if "/p/" not in uri + else zhihu_constant.ZHIHU_ZHUANLAN_URL + ) + return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs) async def pong(self) -> bool: """ @@ -209,7 +214,7 @@ class ZhiHuClient(AbstractApiClient): return self._extractor.extract_contents_from_search(search_res) async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10, - order_by: str = "sort") -> Dict: + order_by: str = "score") -> Dict: """ 获取内容的一级评论 Args: @@ -222,13 +227,16 @@ class ZhiHuClient(AbstractApiClient): Returns: """ - uri = f"/api/v4/{content_type}s/{content_id}/root_comments" - params = { - "order": order_by, - "offset": offset, - "limit": limit - } + uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment" + params = {"order": order_by, "offset": offset, "limit": limit} return await self.get(uri, params) + # uri = f"/api/v4/{content_type}s/{content_id}/root_comments" + # params = { + # "order": order_by, + # "offset": offset, + # "limit": limit + # } + # return await self.get(uri, params) async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10, order_by: str = "sort") -> Dict: @@ -496,3 +504,46 @@ class ZhiHuClient(AbstractApiClient): offset += limit await asyncio.sleep(crawl_interval) return all_contents + + + async def get_answer_info( + self, question_id: str, answer_id: str + ) -> Optional[ZhihuContent]: + """ + 获取回答信息 + Args: + question_id: + answer_id: + + Returns: + + """ + uri = f"/question/{question_id}/answer/{answer_id}" + response_html = await self.get(uri, return_response=True) + return self._extractor.extract_answer_content_from_html(response_html) + + async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]: + """ + 获取文章信息 + Args: + article_id: + + Returns: + + """ + uri = f"/p/{article_id}" + response_html = await self.get(uri, return_response=True) + return self._extractor.extract_article_content_from_html(response_html) + + async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]: + """ + 获取视频信息 + Args: + video_id: + + Returns: + + """ + uri = f"/zvideo/{video_id}" + response_html = await self.get(uri, return_response=True) + return self._extractor.extract_zvideo_content_from_html(response_html) diff --git a/media_platform/zhihu/core.py b/media_platform/zhihu/core.py index 597b602..b654003 100644 --- a/media_platform/zhihu/core.py +++ b/media_platform/zhihu/core.py @@ -14,12 +14,13 @@ import asyncio import os import random from asyncio import Task -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, cast from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright) import config +from constant import zhihu as constant from base.base_crawler import AbstractCrawler from model.m_zhihu import ZhihuContent, ZhihuCreator from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool @@ -29,7 +30,7 @@ from var import crawler_type_var, source_keyword_var from .client import ZhiHuClient from .exception import DataFetchError -from .help import ZhihuExtractor +from .help import ZhihuExtractor, judge_zhihu_url from .login import ZhiHuLogin @@ -96,7 +97,7 @@ class ZhihuCrawler(AbstractCrawler): await self.search() elif config.CRAWLER_TYPE == "detail": # Get the information and comments of the specified post - raise NotImplementedError + await self.get_specified_notes() elif config.CRAWLER_TYPE == "creator": # Get creator's information and their notes and comments await self.get_creators_and_notes() @@ -226,6 +227,76 @@ class ZhihuCrawler(AbstractCrawler): # Get all comments of the creator's contents await self.batch_get_content_comments(all_content_list) + async def get_note_detail( + self, full_note_url: str, semaphore: asyncio.Semaphore + ) -> Optional[ZhihuContent]: + """ + Get note detail + Args: + full_note_url: str + semaphore: + + Returns: + + """ + async with semaphore: + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}" + ) + # judge note type + note_type: str = judge_zhihu_url(full_note_url) + if note_type == constant.ANSWER_NAME: + question_id = full_note_url.split("/")[-3] + answer_id = full_note_url.split("/")[-1] + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}" + ) + return await self.zhihu_client.get_answer_info(question_id, answer_id) + + elif note_type == constant.ARTICLE_NAME: + article_id = full_note_url.split("/")[-1] + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}" + ) + return await self.zhihu_client.get_article_info(article_id) + + elif note_type == constant.VIDEO_NAME: + video_id = full_note_url.split("/")[-1] + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}" + ) + return await self.zhihu_client.get_video_info(video_id) + + async def get_specified_notes(self): + """ + Get the information and comments of the specified post + Returns: + + """ + get_note_detail_task_list = [] + for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST: + # remove query params + full_note_url = full_note_url.split("?")[0] + crawler_task = self.get_note_detail( + full_note_url=full_note_url, + semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM), + ) + get_note_detail_task_list.append(crawler_task) + + need_get_comment_notes: List[ZhihuContent] = [] + note_details = await asyncio.gather(*get_note_detail_task_list) + for index, note_detail in enumerate(note_details): + if not note_detail: + utils.logger.info( + f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found" + ) + continue + + note_detail = cast(ZhihuContent, note_detail) # only for type check + need_get_comment_notes.append(note_detail) + await zhihu_store.update_zhihu_content(note_detail) + + await self.batch_get_content_comments(need_get_comment_notes) @staticmethod def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: diff --git a/media_platform/zhihu/help.py b/media_platform/zhihu/help.py index 83dccd1..418333c 100644 --- a/media_platform/zhihu/help.py +++ b/media_platform/zhihu/help.py @@ -159,15 +159,13 @@ class ZhihuExtractor: res = ZhihuContent() if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的 - res.content_id = zvideo.get("video").get("video_id") res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}" res.created_time = zvideo.get("published_at") res.updated_time = zvideo.get("updated_at") else: - res.content_id = zvideo.get("zvideo_id") res.content_url = zvideo.get("video_url") res.created_time = zvideo.get("created_at") - + res.content_id = zvideo.get("id") res.content_type = zvideo.get("type") res.title = extract_text_from_html(zvideo.get("title")) res.desc = extract_text_from_html(zvideo.get("description")) @@ -369,3 +367,94 @@ class ZhihuExtractor: return [] return self._extract_content_list(anwser_list) + + + + + def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]: + """ + extract zhihu answer content from html + Args: + html_content: + + Returns: + + """ + js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="") + if not js_init_data: + return None + json_data: Dict = json.loads(js_init_data) + answer_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("answers", {}) + if not answer_info: + return None + + return self._extract_answer_content(answer_info.get(list(answer_info.keys())[0])) + + def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]: + """ + extract zhihu article content from html + Args: + html_content: + + Returns: + + """ + js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="") + if not js_init_data: + return None + json_data: Dict = json.loads(js_init_data) + article_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("articles", {}) + if not article_info: + return None + + return self._extract_article_content(article_info.get(list(article_info.keys())[0])) + + def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]: + """ + extract zhihu zvideo content from html + Args: + html_content: + + Returns: + + """ + js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="") + if not js_init_data: + return None + json_data: Dict = json.loads(js_init_data) + zvideo_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("zvideos", {}) + users: Dict = json_data.get("initialState", {}).get("entities", {}).get("users", {}) + if not zvideo_info: + return None + + # handler user info and video info + video_detail_info: Dict = zvideo_info.get(list(zvideo_info.keys())[0]) + if not video_detail_info: + return None + if isinstance(video_detail_info.get("author"), str): + author_name: str = video_detail_info.get("author") + video_detail_info["author"] = users.get(author_name) + + return self._extract_zvideo_content(video_detail_info) + + +def judge_zhihu_url(note_detail_url: str) -> str: + """ + judge zhihu url type + Args: + note_detail_url: + eg1: https://www.zhihu.com/question/123456789/answer/123456789 # answer + eg2: https://www.zhihu.com/p/123456789 # article + eg3: https://www.zhihu.com/zvideo/123456789 # zvideo + + Returns: + + """ + if "/answer/" in note_detail_url: + return zhihu_constant.ANSWER_NAME + elif "/p/" in note_detail_url: + return zhihu_constant.ARTICLE_NAME + elif "/zvideo/" in note_detail_url: + return zhihu_constant.VIDEO_NAME + else: + return ""