mirror of https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 03:15:17 +08:00

feat: Zhihu supports detail mode
@@ -32,7 +32,7 @@
 | Bilibili | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Weibo | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Tieba | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| Zhihu | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Zhihu | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |

 ### MediaCrawlerPro is officially released!!!

 > It is about learning the architecture of a mature project, not just crawling; the other code-design ideas in Pro are also worth studying. Welcome to follow it!!!
@@ -111,7 +111,9 @@
 > [MediaCrawler online documentation](https://nanmicoder.github.io/MediaCrawler/)
 >

-# Paid knowledge services
+# Knowledge services offered by the author
+> If you want to get started quickly, learn how to use this project and how its source code is architected, pick up programming techniques, or understand the source-code design of MediaCrawlerPro, take a look at my paid knowledge column.

 [Introduction to the author's paid knowledge column](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)

 # Project WeChat discussion group
@@ -162,6 +162,13 @@ ZHIHU_CREATOR_URL_LIST = [
     # ........................
 ]

+# List of the specified Zhihu post URLs to crawl
+ZHIHU_SPECIFIED_ID_LIST = [
+    "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
+    "https://zhuanlan.zhihu.com/p/673461588",  # article
+    "https://www.zhihu.com/zvideo/1539542068422144000"  # video
+]
+
 # Word cloud settings
 # Whether to generate a word cloud from comments
 ENABLE_GET_WORDCLOUD = False
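Detail mode consumes this new list by stripping each URL's query string and dispatching on its path. A minimal, self-contained sketch of that flow (the dispatch rules mirror `judge_zhihu_url()` and `get_specified_notes()` added further down in this commit):

```python
# Sketch only: how detail mode consumes ZHIHU_SPECIFIED_ID_LIST.
ZHIHU_SPECIFIED_ID_LIST = [
    "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
    "https://zhuanlan.zhihu.com/p/673461588",                      # article
    "https://www.zhihu.com/zvideo/1539542068422144000",            # video
]

for url in ZHIHU_SPECIFIED_ID_LIST:
    url = url.split("?")[0]  # the crawler strips query params first
    if "/answer/" in url:
        question_id, answer_id = url.split("/")[-3], url.split("/")[-1]
        print("answer:", question_id, answer_id)
    elif "/p/" in url:
        print("article:", url.split("/")[-1])
    elif "/zvideo/" in url:
        print("video:", url.split("/")[-1])
```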
@@ -11,7 +11,9 @@
 # -*- coding: utf-8 -*-
 ZHIHU_URL = "https://www.zhihu.com"
+ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"

 ANSWER_NAME = "answer"
 ARTICLE_NAME = "article"
+VIDEO_NAME = "zvideo"
@@ -121,7 +121,12 @@ class ZhiHuClient(AbstractApiClient):
         if isinstance(params, dict):
             final_uri += '?' + urlencode(params)
         headers = await self._pre_headers(final_uri)
-        return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
+        base_url = (
+            zhihu_constant.ZHIHU_URL
+            if "/p/" not in uri
+            else zhihu_constant.ZHIHU_ZHUANLAN_URL
+        )
+        return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)

     async def pong(self) -> bool:
         """
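The change above routes requests by host: zhuanlan articles (`/p/...` URIs) are served from a different domain than answers and zvideos. A standalone sketch of the same rule, with the constants copied from constant/zhihu.py:

```python
# Constants copied from constant/zhihu.py; the routing rule mirrors the hunk above.
ZHIHU_URL = "https://www.zhihu.com"
ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"

def choose_base_url(uri: str) -> str:
    """Pick the host that serves the given URI."""
    return ZHIHU_URL if "/p/" not in uri else ZHIHU_ZHUANLAN_URL

assert choose_base_url("/question/1/answer/2") == ZHIHU_URL
assert choose_base_url("/p/673461588") == ZHIHU_ZHUANLAN_URL
assert choose_base_url("/zvideo/123") == ZHIHU_URL
```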
@@ -209,7 +214,7 @@ class ZhiHuClient(AbstractApiClient):
         return self._extractor.extract_contents_from_search(search_res)

     async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
-                                order_by: str = "sort") -> Dict:
+                                order_by: str = "score") -> Dict:
         """
         Get the first-level comments of a content item
         Args:
@@ -222,13 +227,16 @@ class ZhiHuClient(AbstractApiClient):
         Returns:

         """
-        uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
-        params = {
-            "order": order_by,
-            "offset": offset,
-            "limit": limit
-        }
+        uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
+        params = {"order": order_by, "offset": offset, "limit": limit}
         return await self.get(uri, params)
+        # uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
+        # params = {
+        #     "order": order_by,
+        #     "offset": offset,
+        #     "limit": limit
+        # }
+        # return await self.get(uri, params)

     async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
                                  order_by: str = "sort") -> Dict:
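The endpoint moves to `comment_v5` and the default sort changes from `sort` to `score`; the old call is kept commented out. A hedged paging sketch built on `get_root_comments`; the `paging.is_end` stop condition and the `next_offset` field name are assumptions, not confirmed by this diff:

```python
import asyncio

async def fetch_all_root_comments(client, content_id: str, content_type: str) -> list:
    """Page through all first-level comments of one content item (sketch)."""
    comments, offset = [], ""
    while True:
        res = await client.get_root_comments(
            content_id, content_type, offset=offset, limit=10, order_by="score"
        )
        comments.extend(res.get("data", []))
        paging = res.get("paging", {})
        if paging.get("is_end", True):  # assumed response shape
            break
        offset = paging.get("next_offset", "")  # hypothetical field name
        await asyncio.sleep(1)  # crawl politely between pages
    return comments
```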
@@ -496,3 +504,46 @@ class ZhiHuClient(AbstractApiClient):
             offset += limit
             await asyncio.sleep(crawl_interval)
         return all_contents
+
+    async def get_answer_info(
+            self, question_id: str, answer_id: str
+    ) -> Optional[ZhihuContent]:
+        """
+        Get answer info
+        Args:
+            question_id:
+            answer_id:
+
+        Returns:
+
+        """
+        uri = f"/question/{question_id}/answer/{answer_id}"
+        response_html = await self.get(uri, return_response=True)
+        return self._extractor.extract_answer_content_from_html(response_html)
+
+    async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
+        """
+        Get article info
+        Args:
+            article_id:
+
+        Returns:
+
+        """
+        uri = f"/p/{article_id}"
+        response_html = await self.get(uri, return_response=True)
+        return self._extractor.extract_article_content_from_html(response_html)
+
+    async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
+        """
+        Get video info
+        Args:
+            video_id:
+
+        Returns:
+
+        """
+        uri = f"/zvideo/{video_id}"
+        response_html = await self.get(uri, return_response=True)
+        return self._extractor.extract_zvideo_content_from_html(response_html)
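Each of the three new fetchers requests the server-rendered detail page and hands the HTML to the extractor. A usage sketch, assuming an already initialized, logged-in `ZhiHuClient` named `client` (construction and login elided); the IDs are the ones from the config example above:

```python
import asyncio

async def demo(client) -> None:
    answer = await client.get_answer_info("826896610", "4885821440")
    article = await client.get_article_info("673461588")
    video = await client.get_video_info("1539542068422144000")
    for content in (answer, article, video):
        # each fetcher returns None when the page carries no js-initialData
        if content:
            print(content.content_type, content.title)
```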
@@ -14,12 +14,13 @@ import asyncio
 import os
 import random
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, cast

 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)

 import config
+from constant import zhihu as constant
 from base.base_crawler import AbstractCrawler
 from model.m_zhihu import ZhihuContent, ZhihuCreator
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
@@ -29,7 +30,7 @@ from var import crawler_type_var, source_keyword_var

 from .client import ZhiHuClient
 from .exception import DataFetchError
-from .help import ZhihuExtractor
+from .help import ZhihuExtractor, judge_zhihu_url
 from .login import ZhiHuLogin

@@ -96,7 +97,7 @@ class ZhihuCrawler(AbstractCrawler):
             await self.search()
         elif config.CRAWLER_TYPE == "detail":
             # Get the information and comments of the specified post
-            raise NotImplementedError
+            await self.get_specified_notes()
         elif config.CRAWLER_TYPE == "creator":
             # Get creator's information and their notes and comments
             await self.get_creators_and_notes()
@@ -226,6 +227,76 @@ class ZhihuCrawler(AbstractCrawler):
         # Get all comments of the creator's contents
         await self.batch_get_content_comments(all_content_list)

+    async def get_note_detail(
+            self, full_note_url: str, semaphore: asyncio.Semaphore
+    ) -> Optional[ZhihuContent]:
+        """
+        Get note detail
+        Args:
+            full_note_url: str
+            semaphore:
+
+        Returns:
+
+        """
+        async with semaphore:
+            utils.logger.info(
+                f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
+            )
+            # judge note type
+            note_type: str = judge_zhihu_url(full_note_url)
+            if note_type == constant.ANSWER_NAME:
+                question_id = full_note_url.split("/")[-3]
+                answer_id = full_note_url.split("/")[-1]
+                utils.logger.info(
+                    f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
+                )
+                return await self.zhihu_client.get_answer_info(question_id, answer_id)
+
+            elif note_type == constant.ARTICLE_NAME:
+                article_id = full_note_url.split("/")[-1]
+                utils.logger.info(
+                    f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
+                )
+                return await self.zhihu_client.get_article_info(article_id)
+
+            elif note_type == constant.VIDEO_NAME:
+                video_id = full_note_url.split("/")[-1]
+                utils.logger.info(
+                    f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
+                )
+                return await self.zhihu_client.get_video_info(video_id)
+
+    async def get_specified_notes(self):
+        """
+        Get the information and comments of the specified post
+        Returns:
+
+        """
+        get_note_detail_task_list = []
+        for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
+            # remove query params
+            full_note_url = full_note_url.split("?")[0]
+            crawler_task = self.get_note_detail(
+                full_note_url=full_note_url,
+                semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM),
+            )
+            get_note_detail_task_list.append(crawler_task)
+
+        need_get_comment_notes: List[ZhihuContent] = []
+        note_details = await asyncio.gather(*get_note_detail_task_list)
+        for index, note_detail in enumerate(note_details):
+            if not note_detail:
+                utils.logger.info(
+                    f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
+                )
+                continue
+
+            note_detail = cast(ZhihuContent, note_detail)  # only for type check
+            need_get_comment_notes.append(note_detail)
+            await zhihu_store.update_zhihu_content(note_detail)
+
+        await self.batch_get_content_comments(need_get_comment_notes)
+
     @staticmethod
     def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
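One detail worth noting in the hunk above (an observation, not part of the commit): a fresh `asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)` is constructed for every task, so each coroutine acquires its own private counter and the concurrency cap is never actually shared across requests. The usual pattern is to create the semaphore once and pass the same instance to every task, as in this self-contained sketch:

```python
import asyncio

MAX_CONCURRENCY_NUM = 4  # stands in for config.MAX_CONCURRENCY_NUM

async def fetch(url: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:         # all tasks contend on the same counter
        await asyncio.sleep(0.1)  # stand-in for the real network call
        return url

async def main() -> None:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)  # created once, shared
    urls = [f"https://www.zhihu.com/p/{i}" for i in range(10)]
    results = await asyncio.gather(*(fetch(u, semaphore) for u in urls))
    print(f"fetched {len(results)} pages, at most {MAX_CONCURRENCY_NUM} at a time")

asyncio.run(main())
```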
@@ -159,15 +159,13 @@ class ZhihuExtractor:
         res = ZhihuContent()

         if "video" in zvideo and isinstance(zvideo.get("video"), dict):  # came from the creator homepage video-list API
             res.content_id = zvideo.get("video").get("video_id")
             res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
             res.created_time = zvideo.get("published_at")
             res.updated_time = zvideo.get("updated_at")
         else:
-            res.content_id = zvideo.get("zvideo_id")
-            res.content_url = zvideo.get("video_url")
-            res.created_time = zvideo.get("created_at")
+            res.content_id = zvideo.get("id")

         res.content_type = zvideo.get("type")
         res.title = extract_text_from_html(zvideo.get("title"))
         res.desc = extract_text_from_html(zvideo.get("description"))
@@ -369,3 +367,94 @@ class ZhihuExtractor:
             return []

         return self._extract_content_list(anwser_list)
+
+    def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+        """
+        Extract Zhihu answer content from HTML
+        Args:
+            html_content:
+
+        Returns:
+
+        """
+        js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+        if not js_init_data:
+            return None
+        json_data: Dict = json.loads(js_init_data)
+        answer_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("answers", {})
+        if not answer_info:
+            return None
+
+        return self._extract_answer_content(answer_info.get(list(answer_info.keys())[0]))
+
+    def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+        """
+        Extract Zhihu article content from HTML
+        Args:
+            html_content:
+
+        Returns:
+
+        """
+        js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+        if not js_init_data:
+            return None
+        json_data: Dict = json.loads(js_init_data)
+        article_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("articles", {})
+        if not article_info:
+            return None
+
+        return self._extract_article_content(article_info.get(list(article_info.keys())[0]))
+
+    def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+        """
+        Extract Zhihu zvideo content from HTML
+        Args:
+            html_content:
+
+        Returns:
+
+        """
+        js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+        if not js_init_data:
+            return None
+        json_data: Dict = json.loads(js_init_data)
+        zvideo_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("zvideos", {})
+        users: Dict = json_data.get("initialState", {}).get("entities", {}).get("users", {})
+        if not zvideo_info:
+            return None
+
+        # handle user info and video info
+        video_detail_info: Dict = zvideo_info.get(list(zvideo_info.keys())[0])
+        if not video_detail_info:
+            return None
+        if isinstance(video_detail_info.get("author"), str):
+            # the author field holds a key into the users entity map
+            author_name: str = video_detail_info.get("author")
+            video_detail_info["author"] = users.get(author_name)
+
+        return self._extract_zvideo_content(video_detail_info)
+
+
+def judge_zhihu_url(note_detail_url: str) -> str:
+    """
+    Judge the Zhihu URL type
+    Args:
+        note_detail_url:
+            eg1: https://www.zhihu.com/question/123456789/answer/123456789  # answer
+            eg2: https://www.zhihu.com/p/123456789  # article
+            eg3: https://www.zhihu.com/zvideo/123456789  # zvideo
+
+    Returns:
+
+    """
+    if "/answer/" in note_detail_url:
+        return zhihu_constant.ANSWER_NAME
+    elif "/p/" in note_detail_url:
+        return zhihu_constant.ARTICLE_NAME
+    elif "/zvideo/" in note_detail_url:
+        return zhihu_constant.VIDEO_NAME
+    else:
+        return ""
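Taken together, the detail pipeline is: classify the URL, fetch the server-rendered page, and read the `js-initialData` JSON blob out of it. A small usage sketch of the helpers above; the saved HTML file name is hypothetical, and `judge_zhihu_url` is the function added in this hunk:

```python
import json

from parsel import Selector

url = "https://www.zhihu.com/question/826896610/answer/4885821440"
print(judge_zhihu_url(url))  # -> "answer"

# Hypothetical file: a Zhihu answer page saved to disk beforehand.
with open("answer_page.html", encoding="utf-8") as f:
    html = f.read()

js_init_data = Selector(text=html).xpath(
    "//script[@id='js-initialData']/text()"
).get(default="")
if js_init_data:
    entities = json.loads(js_init_data).get("initialState", {}).get("entities", {})
    print(list(entities.get("answers", {}).keys()))  # answer ids embedded in the page
```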