mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 03:15:17 +08:00
temp commit
This commit is contained in:
@@ -101,6 +101,11 @@ TIEBA_NAME_LIST = [
|
||||
# "盗墓笔记"
|
||||
]
|
||||
|
||||
# Home-page URLs of the Baidu Tieba creators to crawl.
# The crawler iterates over this list when CRAWLER_TYPE is "creator".
TIEBA_CREATOR_URL_LIST = [
    "https://tieba.baidu.com/home/main/?id=tb.1.6a328702.02qx9GEBmrwqYDRyOgGKXQ&fr=frs",
    # ... add further creator home-page URLs here
]
|
||||
|
||||
# 指定小红书创作者ID列表
|
||||
XHS_CREATOR_ID_LIST = [
|
||||
"63e36c9a000000002703502b",
|
||||
|
||||
@@ -9,7 +9,7 @@ from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
from tools import utils
|
||||
|
||||
@@ -272,8 +272,6 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
current_page += 1
|
||||
return all_sub_comments
|
||||
|
||||
|
||||
|
||||
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
|
||||
"""
|
||||
根据贴吧名称获取帖子列表
|
||||
@@ -287,3 +285,71 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||
uri = f"/f?kw={tieba_name}&pn={page_num}"
|
||||
page_content = await self.get(uri, return_ori_content=True)
|
||||
return self._page_extractor.extract_tieba_note_list(page_content)
|
||||
|
||||
async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
    """
    Fetch a creator's home page and extract the creator information from it.

    Args:
        creator_url: URL of the creator's home page

    Returns:
        The value produced by the page extractor's extract_creator_info
        for the fetched page content.
    """
    # The raw page content is requested so the extractor can parse it.
    creator_page = await self.request(
        method="GET",
        url=creator_url,
        return_ori_content=True,
    )
    extractor = self._page_extractor
    return extractor.extract_creator_info(creator_page)
|
||||
|
||||
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
    """
    Request one page of the threads published by a creator.

    Args:
        user_name: creator user name, sent as the "un" query parameter
        page_number: page index, sent as the "pn" query parameter

    Returns:
        Whatever self.get returns for the request (annotated as Dict).
    """
    query = dict(un=user_name, pn=page_number)
    # NOTE(review): "id" carrying "utf-8" looks like it may have been meant
    # as an encoding parameter such as "ie" -- confirm against the endpoint.
    query["id"] = "utf-8"
    # "_" carries the current timestamp taken at request time.
    query["_"] = utils.get_current_timestamp()
    return await self.get("/home/get/getthread", params=query)
|
||||
|
||||
async def get_all_notes_by_creator_user_name(self, user_name: str, crawl_interval: float = 1.0,
                                             callback: Optional[Callable] = None) -> List[TiebaNote]:
    """
    Collect every note published by a creator, page by page.

    Args:
        user_name: creator user name passed to get_notes_by_creator
        crawl_interval: seconds to sleep after each processed page
        callback: optional coroutine function awaited with each page's note details

    Returns:
        The note details gathered from all successfully processed pages.
    """
    result = []
    notes_has_more = 1
    page_number = 1
    while notes_has_more == 1:
        notes_res = await self.get_notes_by_creator(user_name, page_number)
        # An empty response or a non-zero "no" field is treated as failure;
        # stop paging and return what has been collected so far.
        if not notes_res or notes_res.get("no") != 0:
            utils.logger.error(
                f"[BaiduTieBaClient.get_all_notes_by_creator_user_name] got user_name:{user_name} notes failed, notes_res: {notes_res}")
            break

        notes_has_more = notes_res.get("has_more")
        page_number += 1
        notes = notes_res["thread_list"]
        utils.logger.info(
            f"[BaiduTieBaClient.get_all_notes_by_creator_user_name] got user_name:{user_name} notes len : {len(notes)}")

        # Fetch the detail of every thread on this page concurrently.
        note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes]
        notes = await asyncio.gather(*note_detail_task)
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        result.extend(notes)
    return result
|
||||
|
||||
|
||||
@@ -57,6 +57,9 @@ class TieBaCrawler(AbstractCrawler):
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their notes and comments
|
||||
await self.get_creators_and_notes()
|
||||
else:
|
||||
pass
|
||||
|
||||
@@ -215,6 +218,38 @@ class TieBaCrawler(AbstractCrawler):
|
||||
callback=tieba_store.batch_update_tieba_note_comments
|
||||
)
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
    """
    Get creator's information and their notes and comments.

    For every URL in config.TIEBA_CREATOR_URL_LIST the creator information is
    fetched and saved, then all notes of that creator are crawled (each page is
    stored through tieba_store.batch_update_tieba_notes) and finally the
    comments of those notes are fetched.

    Returns:
        None
    """
    utils.logger.info("[TieBaCrawler.get_creators_and_notes] Begin get tieba creators")
    for creator_url in config.TIEBA_CREATOR_URL_LIST:
        creator_info = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
        if not creator_info:
            # Nothing usable for this URL: report it and move on to the next creator.
            utils.logger.error(
                f"[TieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}")
            continue

        utils.logger.info(f"[TieBaCrawler.get_creators_and_notes] creator info: {creator_info}")

        # The client returns a TiebaCreator model, which has no dict-style
        # .get(); read its fields as attributes. save_creator expects a
        # mapping, so the model is converted with dict().
        await tieba_store.save_creator(creator_info.user_id, user_info=dict(creator_info))

        # Get all note information of the creator
        all_notes_list = await self.tieba_client.get_all_notes_by_creator_user_name(
            user_name=creator_info.user_name,
            crawl_interval=0,
            callback=tieba_store.batch_update_tieba_notes
        )

        await self.batch_get_note_comments(all_notes_list)
|
||||
|
||||
|
||||
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
chromium: BrowserType,
|
||||
|
||||
@@ -3,13 +3,17 @@ import html
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
from urllib.parse import unquote, parse_qs
|
||||
|
||||
from parsel import Selector
|
||||
|
||||
from constant import baidu_tieba as const
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator
|
||||
from tools import utils
|
||||
|
||||
# Marker substrings for the gender of a creator.
# NOTE(review): presumably these are matched against CSS class names in the
# profile HTML; "sex_fmale" may be a misspelling of "sex_female" -- confirm
# against a real profile page before changing the value.
GENDER_MALE = "sex_male"
GENDER_FMALE = "sex_fmale"
|
||||
|
||||
|
||||
class TieBaExtractor:
|
||||
def __init__(self):
|
||||
@@ -199,8 +203,37 @@ class TieBaExtractor:
|
||||
|
||||
return comments
|
||||
|
||||
@staticmethod
|
||||
def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]:
|
||||
def extract_creator_info(self, html_content: str) -> TiebaCreator:
    """
    Extract Tieba creator information from a creator home page.

    Args:
        html_content: HTML text of the creator's home page

    Returns:
        A TiebaCreator. user_name and user_id come from the "un" and "id" query
        parameters of the link inside <p class="space"> (empty string when
        missing); follows, fans, follow_tieba_list and registration_duration
        are not extracted yet and are filled with placeholder values.
    """
    selector = Selector(text=html_content)
    user_link: str = selector.xpath("//p[@class='space']/a").xpath("./@href").get(default='')

    # Parse only the part after "?". Feeding the whole href to parse_qs fuses
    # the path with the first parameter name (e.g. "/home/main?un"), so that
    # parameter could never be found under its real key.
    query_string = user_link.split("?", 1)[-1]
    user_link_params: Dict = parse_qs(unquote(query_string))
    user_name = user_link_params.get("un")[0] if user_link_params.get("un") else ""
    user_id = user_link_params.get("id")[0] if user_link_params.get("id") else ""

    # Serialise the user-data block once; gender and IP are both read from it.
    userinfo_userdata_html = selector.xpath("//div[@class='userinfo_userdata']").get(default='')
    creator = TiebaCreator(
        user_id=user_id,
        user_name=user_name,
        nickname=selector.xpath(".//a[@class='userinfo_username']/text()").get(default='').strip(),
        avatar=selector.xpath(".//div[@class='userinfo_left_head']//img/@src").get(default='').strip(),
        # Fall back to "" so a missing gender never reaches the str field as None.
        gender=self.extract_gender(userinfo_userdata_html) or "",
        ip_location=self.extract_ip(userinfo_userdata_html),
        follows=0,
        fans=0,
        follow_tieba_list="",
        registration_duration="",
    )
    return creator
|
||||
|
||||
def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
|
||||
"""
|
||||
提取IP位置和发布时间
|
||||
Args:
|
||||
@@ -209,13 +242,37 @@ class TieBaExtractor:
|
||||
Returns:
|
||||
|
||||
"""
|
||||
pattern_ip = re.compile(r'IP属地:(\S+)</span>')
|
||||
pattern_pub_time = re.compile(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>')
|
||||
ip_match = pattern_ip.search(html_content)
|
||||
time_match = pattern_pub_time.search(html_content)
|
||||
ip = ip_match.group(1) if ip_match else ""
|
||||
pub_time = time_match.group(1) if time_match else ""
|
||||
return ip, pub_time
|
||||
return self.extract_ip(html_content), pub_time
|
||||
|
||||
@staticmethod
def extract_ip(html_content: str) -> str:
    """
    Extract the IP location text from an HTML fragment.

    Args:
        html_content: HTML text to search

    Returns:
        The run of non-whitespace characters captured between "IP属地:" and
        "</span>", or an empty string when the pattern is not found.
    """
    found = re.search(r'IP属地:(\S+)</span>', html_content)
    if found is None:
        return ""
    return found.group(1)
|
||||
|
||||
@staticmethod
def extract_gender(html_content: str) -> str:
    """
    Extract the creator's gender from an HTML fragment.

    The fragment is searched for the module-level GENDER_FMALE and GENDER_MALE
    marker substrings.

    Args:
        html_content: HTML text to search

    Returns:
        "f" when the female marker is present, "m" when the male marker is
        present, otherwise an empty string. "f" is the value the store's
        save_creator compares against when labelling the gender.
    """
    if not html_content:
        return ""
    if GENDER_FMALE in html_content:
        return "f"
    if GENDER_MALE in html_content:
        return "m"
    return ""
|
||||
|
||||
@staticmethod
|
||||
def extract_data_field_value(selector: Selector) -> Dict:
|
||||
|
||||
@@ -44,3 +44,20 @@ class TiebaComment(BaseModel):
|
||||
tieba_name: str = Field(..., description="所属的贴吧名称")
|
||||
tieba_link: str = Field(..., description="贴吧链接")
|
||||
|
||||
|
||||
|
||||
class TiebaCreator(BaseModel):
    """
    Baidu Tieba creator
    """

    # Identity
    user_id: str = Field(
        ...,
        description="用户ID",
    )
    user_name: str = Field(
        ...,
        description="用户名",
    )
    nickname: str = Field(
        ...,
        description="用户昵称",
    )

    # Profile details
    gender: str = Field(
        default="",
        description="用户性别",
    )
    avatar: str = Field(
        ...,
        description="用户头像地址",
    )
    ip_location: Optional[str] = Field(
        default="",
        description="IP地理位置",
    )

    # Social counters and miscellaneous profile data
    follows: int = Field(
        default=0,
        description="关注数",
    )
    fans: int = Field(
        default=0,
        description="粉丝数",
    )
    follow_tieba_list: str = Field(
        default="",
        description="关注的贴吧列表",
    )
    registration_duration: str = Field(
        default="",
        description="注册时长",
    )
|
||||
|
||||
|
||||
@@ -23,6 +23,19 @@ class TieBaStoreFactory:
|
||||
"[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
|
||||
return store_class()
|
||||
|
||||
async def batch_update_tieba_notes(note_list: List[TiebaNote]):
    """
    Store a batch of Tieba notes, one after another.

    Args:
        note_list: notes to store; an empty or missing list is a no-op

    Returns:
        None
    """
    # A falsy note_list is replaced by an empty tuple, so nothing is stored.
    for tieba_note in note_list or ():
        await update_tieba_note(tieba_note)
|
||||
|
||||
async def update_tieba_note(note_item: TiebaNote):
|
||||
"""
|
||||
@@ -71,3 +84,29 @@ async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment):
|
||||
save_comment_item.update({"last_modify_ts": utils.get_current_timestamp()})
|
||||
utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{save_comment_item}")
|
||||
await TieBaStoreFactory.create_store().store_comment(save_comment_item)
|
||||
|
||||
|
||||
async def save_creator(user_id: str, user_info: Dict):
    """
    Save creator information to local

    The follow count, fan count and followed-tieba list are read from the
    TiebaCreator field names (follows, fans, follow_tieba_list) first; the
    previously used keys (follow_count, followers_count, tieba_list) are kept
    as a fallback so existing callers keep working.

    Args:
        user_id: creator user id, stored as 'user_id'
        user_info: mapping with the creator's fields

    Returns:
        None
    """

    def _first_present(*keys: str):
        # Return the first value that is present and not None, else ''.
        for key in keys:
            value = user_info.get(key)
            if value is not None:
                return value
        return ''

    local_db_item = {
        'user_id': user_id,
        'nickname': user_info.get('nickname'),
        # NOTE(review): every value other than "f" (including a missing gender)
        # is labelled as male -- confirm this is intended.
        'gender': '女' if user_info.get('gender') == "f" else '男',
        'avatar': user_info.get('avatar'),
        'ip_location': user_info.get("ip_location", ""),
        'follows': _first_present('follows', 'follow_count'),
        'fans': _first_present('fans', 'followers_count'),
        'follow_tieba_list': _first_present('follow_tieba_list', 'tieba_list'),
        'last_modify_ts': utils.get_current_timestamp(),
        'registration_duration': user_info.get("registration_duration", ""),
    }
    utils.logger.info(f"[store.tieba.save_creator] creator:{local_db_item}")
    await TieBaStoreFactory.create_store().store_creator(local_db_item)
|
||||
Reference in New Issue
Block a user