mirror of https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 11:29:27 +08:00

feat: add detail mode for Zhihu (知乎支持详情模式)

@@ -32,7 +32,7 @@
 | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 微博 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 贴吧 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| 知乎 | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 知乎 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 
 ### MediaCrawlerPro has officially launched!!!
 
 > It focuses on teaching the architecture design of a mature project; beyond the crawler itself, the other code design ideas in Pro are also worth learning. Everyone is welcome to follow it!!!

@@ -111,7 +111,9 @@
 > [MediaCrawler Online Documentation](https://nanmicoder.github.io/MediaCrawler/)
 >
 
-# Paid knowledge services
+# Knowledge services offered by the author
 
+> If you want to get started quickly, learn how to use this project and how its source code is architected, pick up programming techniques, or understand the source design of MediaCrawlerPro, take a look at my paid knowledge column.
+
 [Introduction to the author's paid knowledge column](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
 
 # Project WeChat discussion group

@@ -162,6 +162,13 @@ ZHIHU_CREATOR_URL_LIST = [
     # ........................
 ]
 
+# Specified list of Zhihu post URLs to crawl
+ZHIHU_SPECIFIED_ID_LIST = [
+    "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
+    "https://zhuanlan.zhihu.com/p/673461588",  # article
+    "https://www.zhihu.com/zvideo/1539542068422144000"  # video
+]
+
 # Word-cloud settings
 # Whether to generate a word cloud from comments
 ENABLE_GET_WORDCLOUD = False
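
With ZHIHU_SPECIFIED_ID_LIST filled in, detail mode is selected by setting CRAWLER_TYPE to "detail"; assuming MediaCrawler's usual CLI flags, a run would look like:

python main.py --platform zhihu --lt qrcode --type detail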

@@ -11,7 +11,9 @@
 
 # -*- coding: utf-8 -*-
 ZHIHU_URL = "https://www.zhihu.com"
+ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"
 
 ANSWER_NAME = "answer"
 ARTICLE_NAME = "article"
 VIDEO_NAME = "zvideo"

@@ -121,7 +121,12 @@ class ZhiHuClient(AbstractApiClient):
         if isinstance(params, dict):
             final_uri += '?' + urlencode(params)
         headers = await self._pre_headers(final_uri)
-        return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
+        base_url = (
+            zhihu_constant.ZHIHU_URL
+            if "/p/" not in uri
+            else zhihu_constant.ZHIHU_ZHUANLAN_URL
+        )
+        return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)
 
     async def pong(self) -> bool:
         """
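
The base_url switch above reflects how Zhihu hosts content: column articles ("/p/<article_id>") are served from zhuanlan.zhihu.com, while answers and zvideos stay on www.zhihu.com. A self-contained sketch of that routing (resolve_base_url is an illustrative helper, not part of the codebase):

ZHIHU_URL = "https://www.zhihu.com"                # mirrors zhihu_constant.ZHIHU_URL
ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"  # mirrors zhihu_constant.ZHIHU_ZHUANLAN_URL

def resolve_base_url(uri: str) -> str:
    # Article URIs ("/p/<article_id>") live on the zhuanlan host; everything else stays on www.
    return ZHIHU_ZHUANLAN_URL if "/p/" in uri else ZHIHU_URL

assert resolve_base_url("/p/673461588") == ZHIHU_ZHUANLAN_URL
assert resolve_base_url("/question/826896610/answer/4885821440") == ZHIHU_URL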

@@ -209,7 +214,7 @@ class ZhiHuClient(AbstractApiClient):
         return self._extractor.extract_contents_from_search(search_res)
 
     async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
-                                order_by: str = "sort") -> Dict:
+                                order_by: str = "score") -> Dict:
         """
         Get the first-level comments of a piece of content
         Args:

@@ -222,13 +227,16 @@ class ZhiHuClient(AbstractApiClient):
         Returns:
 
         """
-        uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
-        params = {
-            "order": order_by,
-            "offset": offset,
-            "limit": limit
-        }
+        uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
+        params = {"order": order_by, "offset": offset, "limit": limit}
         return await self.get(uri, params)
+        # uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
+        # params = {
+        #     "order": order_by,
+        #     "offset": offset,
+        #     "limit": limit
+        # }
+        # return await self.get(uri, params)
 
     async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
                                  order_by: str = "sort") -> Dict:
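
Taken together with the signature change above, first-level comments now come from Zhihu's comment_v5 endpoint with "score" ordering instead of the old root_comments endpoint with "sort". For the sample answer from the config, the constructed request would look as follows (a sketch; the id is just the example value):

content_type, content_id = "answer", "4885821440"
uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
params = {"order": "score", "offset": "", "limit": 10}
# -> GET https://www.zhihu.com/api/v4/comment_v5/answers/4885821440/root_comment?order=score&offset=&limit=10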

@@ -496,3 +504,46 @@ class ZhiHuClient(AbstractApiClient):
             offset += limit
             await asyncio.sleep(crawl_interval)
         return all_contents
 
+    async def get_answer_info(
+        self, question_id: str, answer_id: str
+    ) -> Optional[ZhihuContent]:
+        """
+        Get answer info
+        Args:
+            question_id:
+            answer_id:
+
+        Returns:
+
+        """
+        uri = f"/question/{question_id}/answer/{answer_id}"
+        response_html = await self.get(uri, return_response=True)
+        return self._extractor.extract_answer_content_from_html(response_html)
+
+    async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
+        """
+        Get article info
+        Args:
+            article_id:
+
+        Returns:
+
+        """
+        uri = f"/p/{article_id}"
+        response_html = await self.get(uri, return_response=True)
+        return self._extractor.extract_article_content_from_html(response_html)
+
+    async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
+        """
+        Get video info
+        Args:
+            video_id:
+
+        Returns:
+
+        """
+        uri = f"/zvideo/{video_id}"
+        response_html = await self.get(uri, return_response=True)
+        return self._extractor.extract_zvideo_content_from_html(response_html)
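
The three fetchers above share one pattern: request the public HTML page (return_response=True makes the client hand back the raw response text rather than parsed JSON) and delegate parsing to the extractor. A minimal usage sketch, assuming an already logged-in ZhiHuClient instance named client and an enclosing async function:

content = await client.get_answer_info(question_id="826896610", answer_id="4885821440")
if content is not None:
    print(content.content_id, content.content_url)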

@@ -14,12 +14,13 @@ import asyncio
 import os
 import random
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, cast
 
 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
 
 import config
+from constant import zhihu as constant
 from base.base_crawler import AbstractCrawler
 from model.m_zhihu import ZhihuContent, ZhihuCreator
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool

@@ -29,7 +30,7 @@ from var import crawler_type_var, source_keyword_var
 
 from .client import ZhiHuClient
 from .exception import DataFetchError
-from .help import ZhihuExtractor
+from .help import ZhihuExtractor, judge_zhihu_url
 from .login import ZhiHuLogin

@@ -96,7 +97,7 @@ class ZhihuCrawler(AbstractCrawler):
             await self.search()
         elif config.CRAWLER_TYPE == "detail":
             # Get the information and comments of the specified post
-            raise NotImplementedError
+            await self.get_specified_notes()
         elif config.CRAWLER_TYPE == "creator":
             # Get creator's information and their notes and comments
             await self.get_creators_and_notes()
|||||||
# Get all comments of the creator's contents
|
# Get all comments of the creator's contents
|
||||||
await self.batch_get_content_comments(all_content_list)
|
await self.batch_get_content_comments(all_content_list)
|
||||||
|
|
||||||
|
async def get_note_detail(
|
||||||
|
self, full_note_url: str, semaphore: asyncio.Semaphore
|
||||||
|
) -> Optional[ZhihuContent]:
|
||||||
|
"""
|
||||||
|
Get note detail
|
||||||
|
Args:
|
||||||
|
full_note_url: str
|
||||||
|
semaphore:
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
|
||||||
|
"""
|
||||||
|
async with semaphore:
|
||||||
|
utils.logger.info(
|
||||||
|
f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
|
||||||
|
)
|
||||||
|
# judge note type
|
||||||
|
note_type: str = judge_zhihu_url(full_note_url)
|
||||||
|
if note_type == constant.ANSWER_NAME:
|
||||||
|
question_id = full_note_url.split("/")[-3]
|
||||||
|
answer_id = full_note_url.split("/")[-1]
|
||||||
|
utils.logger.info(
|
||||||
|
f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
|
||||||
|
)
|
||||||
|
return await self.zhihu_client.get_answer_info(question_id, answer_id)
|
||||||
|
|
||||||
|
elif note_type == constant.ARTICLE_NAME:
|
||||||
|
article_id = full_note_url.split("/")[-1]
|
||||||
|
utils.logger.info(
|
||||||
|
f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
|
||||||
|
)
|
||||||
|
return await self.zhihu_client.get_article_info(article_id)
|
||||||
|
|
||||||
|
elif note_type == constant.VIDEO_NAME:
|
||||||
|
video_id = full_note_url.split("/")[-1]
|
||||||
|
utils.logger.info(
|
||||||
|
f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
|
||||||
|
)
|
||||||
|
return await self.zhihu_client.get_video_info(video_id)
|
||||||
|
|
||||||
|
async def get_specified_notes(self):
|
||||||
|
"""
|
||||||
|
Get the information and comments of the specified post
|
||||||
|
Returns:
|
||||||
|
|
||||||
|
"""
|
||||||
|
get_note_detail_task_list = []
|
||||||
|
for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST:
|
||||||
|
# remove query params
|
||||||
|
full_note_url = full_note_url.split("?")[0]
|
||||||
|
crawler_task = self.get_note_detail(
|
||||||
|
full_note_url=full_note_url,
|
||||||
|
semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM),
|
||||||
|
)
|
||||||
|
get_note_detail_task_list.append(crawler_task)
|
||||||
|
|
||||||
|
need_get_comment_notes: List[ZhihuContent] = []
|
||||||
|
note_details = await asyncio.gather(*get_note_detail_task_list)
|
||||||
|
for index, note_detail in enumerate(note_details):
|
||||||
|
if not note_detail:
|
||||||
|
utils.logger.info(
|
||||||
|
f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
note_detail = cast(ZhihuContent, note_detail) # only for type check
|
||||||
|
need_get_comment_notes.append(note_detail)
|
||||||
|
await zhihu_store.update_zhihu_content(note_detail)
|
||||||
|
|
||||||
|
await self.batch_get_content_comments(need_get_comment_notes)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
|
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
|
||||||
|
|||||||
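
The answer branch of get_note_detail depends on fixed segment positions in the URL; a standalone check of that slicing, using the sample answer URL from the config:

url = "https://www.zhihu.com/question/826896610/answer/4885821440"
parts = url.split("/")
# parts == ['https:', '', 'www.zhihu.com', 'question', '826896610', 'answer', '4885821440']
question_id, answer_id = parts[-3], parts[-1]
assert (question_id, answer_id) == ("826896610", "4885821440")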

@@ -159,15 +159,13 @@ class ZhihuExtractor:
         res = ZhihuContent()
 
         if "video" in zvideo and isinstance(zvideo.get("video"), dict):  # came from the creator homepage's video-list API
-            res.content_id = zvideo.get("video").get("video_id")
             res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
             res.created_time = zvideo.get("published_at")
             res.updated_time = zvideo.get("updated_at")
         else:
-            res.content_id = zvideo.get("zvideo_id")
             res.content_url = zvideo.get("video_url")
             res.created_time = zvideo.get("created_at")
+        res.content_id = zvideo.get("id")
         res.content_type = zvideo.get("type")
         res.title = extract_text_from_html(zvideo.get("title"))
         res.desc = extract_text_from_html(zvideo.get("description"))

@@ -369,3 +367,94 @@ class ZhihuExtractor:
             return []
 
         return self._extract_content_list(anwser_list)
+
+    def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+        """
+        extract zhihu answer content from html
+        Args:
+            html_content:
+
+        Returns:
+
+        """
+        js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+        if not js_init_data:
+            return None
+        json_data: Dict = json.loads(js_init_data)
+        answer_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("answers", {})
+        if not answer_info:
+            return None
+
+        return self._extract_answer_content(answer_info.get(list(answer_info.keys())[0]))
+
+    def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+        """
+        extract zhihu article content from html
+        Args:
+            html_content:
+
+        Returns:
+
+        """
+        js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+        if not js_init_data:
+            return None
+        json_data: Dict = json.loads(js_init_data)
+        article_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("articles", {})
+        if not article_info:
+            return None
+
+        return self._extract_article_content(article_info.get(list(article_info.keys())[0]))
+
+    def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
+        """
+        extract zhihu zvideo content from html
+        Args:
+            html_content:
+
+        Returns:
+
+        """
+        js_init_data: str = Selector(text=html_content).xpath("//script[@id='js-initialData']/text()").get(default="")
+        if not js_init_data:
+            return None
+        json_data: Dict = json.loads(js_init_data)
+        zvideo_info: Dict = json_data.get("initialState", {}).get("entities", {}).get("zvideos", {})
+        users: Dict = json_data.get("initialState", {}).get("entities", {}).get("users", {})
+        if not zvideo_info:
+            return None
+
+        # handle user info and video info
+        video_detail_info: Dict = zvideo_info.get(list(zvideo_info.keys())[0])
+        if not video_detail_info:
+            return None
+        if isinstance(video_detail_info.get("author"), str):
+            author_name: str = video_detail_info.get("author")
+            video_detail_info["author"] = users.get(author_name)
+
+        return self._extract_zvideo_content(video_detail_info)
+
+
+def judge_zhihu_url(note_detail_url: str) -> str:
+    """
+    judge zhihu url type
+    Args:
+        note_detail_url:
+            eg1: https://www.zhihu.com/question/123456789/answer/123456789 # answer
+            eg2: https://www.zhihu.com/p/123456789 # article
+            eg3: https://www.zhihu.com/zvideo/123456789 # zvideo
+
+    Returns:
+
+    """
+    if "/answer/" in note_detail_url:
+        return zhihu_constant.ANSWER_NAME
+    elif "/p/" in note_detail_url:
+        return zhihu_constant.ARTICLE_NAME
+    elif "/zvideo/" in note_detail_url:
+        return zhihu_constant.VIDEO_NAME
+    else:
+        return ""
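
All three extract_*_from_html helpers read the same embedded state blob: the JSON inside the page's <script id="js-initialData"> tag, whose entities are keyed by content id. A trimmed, hypothetical payload illustrating the shape the answer path expects (real pages carry far more fields):

import json

js_init_data = '{"initialState": {"entities": {"answers": {"4885821440": {"id": 4885821440}}}}}'
json_data = json.loads(js_init_data)
answer_info = json_data.get("initialState", {}).get("entities", {}).get("answers", {})
first_answer = answer_info[list(answer_info.keys())[0]]  # entities are keyed by content id
assert first_answer["id"] == 4885821440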