From 8adb593ba6a2f5826f3f0b5fcb03037ff6aff8d0 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Sat, 24 Aug 2024 09:12:03 +0800 Subject: [PATCH] temp commit --- config/base_config.py | 5 +++ media_platform/tieba/client.py | 72 ++++++++++++++++++++++++++++++++-- media_platform/tieba/core.py | 35 +++++++++++++++++ media_platform/tieba/help.py | 71 +++++++++++++++++++++++++++++---- model/m_baidu_tieba.py | 17 ++++++++ store/tieba/__init__.py | 39 ++++++++++++++++++ 6 files changed, 229 insertions(+), 10 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index 6456727..4d389e4 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -101,6 +101,11 @@ TIEBA_NAME_LIST = [ # "盗墓笔记" ] +TIEBA_CREATOR_URL_LIST = [ + "https://tieba.baidu.com/home/main/?id=tb.1.6a328702.02qx9GEBmrwqYDRyOgGKXQ&fr=frs", + # ........................ +] + # 指定小红书创作者ID列表 XHS_CREATOR_ID_LIST = [ "63e36c9a000000002703502b", diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index daa1c4c..cb83c4a 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -9,7 +9,7 @@ from tenacity import RetryError, retry, stop_after_attempt, wait_fixed import config from base.base_crawler import AbstractApiClient -from model.m_baidu_tieba import TiebaComment, TiebaNote +from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator from proxy.proxy_ip_pool import ProxyIpPool from tools import utils @@ -272,8 +272,6 @@ class BaiduTieBaClient(AbstractApiClient): current_page += 1 return all_sub_comments - - async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]: """ 根据贴吧名称获取帖子列表 @@ -287,3 +285,71 @@ class BaiduTieBaClient(AbstractApiClient): uri = f"/f?kw={tieba_name}&pn={page_num}" page_content = await self.get(uri, return_ori_content=True) return self._page_extractor.extract_tieba_note_list(page_content) + + async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator: + """ + 根据创作者ID获取创作者信息 + Args: + creator_url: 创作者主页URL + + Returns: + + """ + page_content = await self.request(method="GET", url=creator_url, return_ori_content=True) + return self._page_extractor.extract_creator_info(page_content) + + async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict: + """ + 根据创作者获取创作者的所有帖子 + Args: + user_name: + page_number: + + Returns: + + """ + uri = f"/home/get/getthread" + params = { + "un": user_name, + "pn": page_number, + "id": "utf-8", + "_": utils.get_current_timestamp() + } + return await self.get(uri, params=params) + + async def get_all_notes_by_creator_user_name(self, user_name: str,crawl_interval: float = 1.0, + callback: Optional[Callable] = None) -> List[TiebaNote]: + """ + 根据创作者用户名获取创作者所有帖子 + Args: + user_name: + crawl_interval: + callback: + + Returns: + + """ + result = [] + notes_has_more = 1 + page_number = 1 + while notes_has_more == 1: + notes_res = await self.get_notes_by_creator(user_name, page_number) + if not notes_res or notes_res.get("no") != 0: + utils.logger.error( + f"[WeiboClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}") + break + + notes_has_more = notes_res.get("has_more") + page_number += 1 + notes = notes_res["thread_list"] + utils.logger.info( + f"[WeiboClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}") + + note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes] + notes = await asyncio.gather(*note_detail_task) + if callback: + await callback(notes) + await asyncio.sleep(crawl_interval) + result.extend(notes) + return result + diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index 2edc2da..7a78382 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -57,6 +57,9 @@ class TieBaCrawler(AbstractCrawler): elif config.CRAWLER_TYPE == "detail": # Get the information and comments of the specified post await self.get_specified_notes() + elif config.CRAWLER_TYPE == "creator": + # Get creator's information and their notes and comments + await self.get_creators_and_notes() else: pass @@ -215,6 +218,38 @@ class TieBaCrawler(AbstractCrawler): callback=tieba_store.batch_update_tieba_note_comments ) + async def get_creators_and_notes(self) -> None: + """ + Get creator's information and their notes and comments + Returns: + + """ + utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators") + for creator_url in config.TIEBA_CREATOR_URL_LIST: + createor_info: Dict = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url) + if createor_info: + utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}") + if not createor_info: + raise Exception("Get creator info error") + user_id = createor_info.get("user_id") + await tieba_store.save_creator(user_id, user_info=createor_info) + + # Get all note information of the creator + all_notes_list = await self.tieba_client.get_all_notes_by_creator_user_name( + user_name=createor_info.get("user_name"), + crawl_interval=0, + callback=tieba_store.batch_update_tieba_notes + ) + + await self.batch_get_note_comments(all_notes_list) + + else: + utils.logger.error( + f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}") + + + + async def launch_browser( self, chromium: BrowserType, diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index 4f3fe15..b0db7d9 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -3,13 +3,17 @@ import html import json import re from typing import Dict, List, Tuple +from urllib.parse import unquote, parse_qs from parsel import Selector from constant import baidu_tieba as const -from model.m_baidu_tieba import TiebaComment, TiebaNote +from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator from tools import utils +GENDER_MALE = "sex_male" +GENDER_FMALE = "sex_fmale" + class TieBaExtractor: def __init__(self): @@ -199,8 +203,37 @@ class TieBaExtractor: return comments - @staticmethod - def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]: + def extract_creator_info(self, html_content: str) -> TiebaCreator: + """ + 提取贴吧创作者信息 + Args: + html_content: + + Returns: + + """ + selector = Selector(text=html_content) + user_link_selector = selector.xpath("//p[@class='space']/a") + user_link: str = user_link_selector.xpath("./@href").get(default='') + user_link_params: Dict = parse_qs(unquote(user_link)) + user_name = user_link_params.get("un")[0] if user_link_params.get("un") else "" + user_id = user_link_params.get("id")[0] if user_link_params.get("id") else "" + userinfo_userdata_selector = selector.xpath("//div[@class='userinfo_userdata']") + creator = TiebaCreator( + user_id=user_id, + user_name=user_name, + nickname=selector.xpath(".//a[@class='userinfo_username']/text()").get(default='').strip(), + avatar=selector.xpath(".//div[@class='userinfo_left_head']//img/@src").get(default='').strip(), + gender=self.extract_gender(userinfo_userdata_selector.get(default='')), + ip_location=self.extract_ip(userinfo_userdata_selector.get(default='')), + follows=0, + fans=0, + follow_tieba_list="", + registration_duration="", + ) + return creator + + def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]: """ 提取IP位置和发布时间 Args: @@ -209,13 +242,37 @@ class TieBaExtractor: Returns: """ - pattern_ip = re.compile(r'IP属地:(\S+)') pattern_pub_time = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2})') - ip_match = pattern_ip.search(html_content) time_match = pattern_pub_time.search(html_content) - ip = ip_match.group(1) if ip_match else "" pub_time = time_match.group(1) if time_match else "" - return ip, pub_time + return self.extract_ip(html_content), pub_time + + @staticmethod + def extract_ip(html_content: str) -> str: + """ + 提取IP + Args: + html_content: + + Returns: + + """ + pattern_ip = re.compile(r'IP属地:(\S+)') + ip_match = pattern_ip.search(html_content) + ip = ip_match.group(1) if ip_match else "" + return ip + + @staticmethod + def extract_gender(html_content: str) -> str: + """ + 提取性别 + Args: + html_content: + + Returns: + + """ + pass @staticmethod def extract_data_field_value(selector: Selector) -> Dict: diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py index 2f7d1b8..5a20bd6 100644 --- a/model/m_baidu_tieba.py +++ b/model/m_baidu_tieba.py @@ -44,3 +44,20 @@ class TiebaComment(BaseModel): tieba_name: str = Field(..., description="所属的贴吧名称") tieba_link: str = Field(..., description="贴吧链接") + + +class TiebaCreator(BaseModel): + """ + 百度贴吧创作者 + """ + user_id: str = Field(..., description="用户ID") + user_name: str = Field(..., description="用户名") + nickname: str = Field(..., description="用户昵称") + gender: str = Field(default="", description="用户性别") + avatar: str = Field(..., description="用户头像地址") + ip_location: Optional[str] = Field(default="", description="IP地理位置") + follows: int = Field(default=0, description="关注数") + fans: int = Field(default=0, description="粉丝数") + follow_tieba_list: str = Field(default="", description="关注的贴吧列表") + registration_duration: str = Field(default="", description="注册时长") + diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index 8de8876..e6708a5 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -23,6 +23,19 @@ class TieBaStoreFactory: "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...") return store_class() +async def batch_update_tieba_notes(note_list: List[TiebaNote]): + """ + Batch update tieba notes + Args: + note_list: + + Returns: + + """ + if not note_list: + return + for note_item in note_list: + await update_tieba_note(note_item) async def update_tieba_note(note_item: TiebaNote): """ @@ -71,3 +84,29 @@ async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment): save_comment_item.update({"last_modify_ts": utils.get_current_timestamp()}) utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{save_comment_item}") await TieBaStoreFactory.create_store().store_comment(save_comment_item) + + +async def save_creator(user_id: str, user_info: Dict): + """ + Save creator information to local + Args: + user_id: + user_info: + + Returns: + + """ + local_db_item = { + 'user_id': user_id, + 'nickname': user_info.get('nickname'), + 'gender': '女' if user_info.get('gender') == "f" else '男', + 'avatar': user_info.get('avatar'), + 'ip_location': user_info.get("ip_location", ""), + 'follows': user_info.get('follow_count', ''), + 'fans': user_info.get('followers_count', ''), + 'follow_tieba_list': user_info.get("tieba_list", ''), + 'last_modify_ts': utils.get_current_timestamp(), + 'registration_duration': user_info.get("registration_duration", ""), + } + utils.logger.info(f"[store.tieba.save_creator] creator:{local_db_item}") + await TieBaStoreFactory.create_store().store_creator(local_db_item) \ No newline at end of file