temp commit

Relakkes
2024-08-24 09:12:03 +08:00
parent 65699aa1cb
commit 8adb593ba6
6 changed files with 229 additions and 10 deletions

View File

@@ -101,6 +101,11 @@ TIEBA_NAME_LIST = [
# "盗墓笔记" # "盗墓笔记"
] ]
TIEBA_CREATOR_URL_LIST = [
"https://tieba.baidu.com/home/main/?id=tb.1.6a328702.02qx9GEBmrwqYDRyOgGKXQ&fr=frs",
# ........................
]
# 指定小红书创作者ID列表 # 指定小红书创作者ID列表
XHS_CREATOR_ID_LIST = [ XHS_CREATOR_ID_LIST = [
"63e36c9a000000002703502b", "63e36c9a000000002703502b",

View File

@@ -9,7 +9,7 @@ from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator
from proxy.proxy_ip_pool import ProxyIpPool
from tools import utils
@@ -272,8 +272,6 @@ class BaiduTieBaClient(AbstractApiClient):
            current_page += 1
        return all_sub_comments

    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
        """
        Get the note list of a tieba by its name
@@ -287,3 +285,71 @@ class BaiduTieBaClient(AbstractApiClient):
uri = f"/f?kw={tieba_name}&pn={page_num}" uri = f"/f?kw={tieba_name}&pn={page_num}"
page_content = await self.get(uri, return_ori_content=True) page_content = await self.get(uri, return_ori_content=True)
return self._page_extractor.extract_tieba_note_list(page_content) return self._page_extractor.extract_tieba_note_list(page_content)
    async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
        """
        Get creator info by the creator's homepage URL
        Args:
            creator_url: creator homepage URL
        Returns:
        """
        page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
        return self._page_extractor.extract_creator_info(page_content)

    async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
        """
        Get one page of a creator's notes
        Args:
            user_name: Tieba user name (sent as the `un` query parameter)
            page_number: page number to fetch
        Returns:
        """
        uri = "/home/get/getthread"
        params = {
            "un": user_name,
            "pn": page_number,
            "id": "utf-8",
            "_": utils.get_current_timestamp()
        }
        return await self.get(uri, params=params)

    async def get_all_notes_by_creator_user_name(self, user_name: str, crawl_interval: float = 1.0,
                                                 callback: Optional[Callable] = None) -> List[TiebaNote]:
        """
        Get all of a creator's notes by user name, paging until the API reports no more data
        Args:
            user_name: Tieba user name
            crawl_interval: delay (in seconds) between page requests
            callback: optional async callback invoked with each fetched page of notes
        Returns:
        """
        result = []
        notes_has_more = 1
        page_number = 1
        while notes_has_more == 1:
            notes_res = await self.get_notes_by_creator(user_name, page_number)
            if not notes_res or notes_res.get("no") != 0:
                utils.logger.error(
                    f"[BaiduTieBaClient.get_all_notes_by_creator_user_name] got user_name:{user_name} notes failed, notes_res: {notes_res}")
                break
            notes_has_more = notes_res.get("has_more")
            page_number += 1
            notes = notes_res["thread_list"]
            utils.logger.info(
                f"[BaiduTieBaClient.get_all_notes_by_creator_user_name] got user_name:{user_name} notes len : {len(notes)}")
            note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes]
            notes = await asyncio.gather(*note_detail_task)
            if callback:
                await callback(notes)
            await asyncio.sleep(crawl_interval)
            result.extend(notes)
        return result
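
The pagination contract in get_all_notes_by_creator_user_name is: /home/get/getthread keeps returning has_more == 1 while further pages exist, each returned thread_id is re-fetched in full via get_note_by_id, and the optional callback receives every page as soon as its details are gathered. A minimal usage sketch, assuming an already-initialized BaiduTieBaClient instance and a made-up user name:

    # Usage sketch only; `client` is an existing, ready-to-use BaiduTieBaClient.
    async def dump_creator_notes(client, user_name: str = "some_tieba_user"):
        async def on_page(notes):
            # Called once per result page with fully fetched TiebaNote objects.
            print(f"got {len(notes)} notes on this page")

        all_notes = await client.get_all_notes_by_creator_user_name(
            user_name=user_name,
            crawl_interval=1.0,
            callback=on_page,
        )
        print(f"total notes for {user_name}: {len(all_notes)}")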

View File

@@ -57,6 +57,9 @@ class TieBaCrawler(AbstractCrawler):
        elif config.CRAWLER_TYPE == "detail":
            # Get the information and comments of the specified post
            await self.get_specified_notes()
        elif config.CRAWLER_TYPE == "creator":
            # Get creator's information and their notes and comments
            await self.get_creators_and_notes()
        else:
            pass
@@ -215,6 +218,38 @@ class TieBaCrawler(AbstractCrawler):
            callback=tieba_store.batch_update_tieba_note_comments
        )
    async def get_creators_and_notes(self) -> None:
        """
        Get creator's information and their notes and comments
        Returns:
        """
        utils.logger.info("[TieBaCrawler.get_creators_and_notes] Begin get tieba creators")
        for creator_url in config.TIEBA_CREATOR_URL_LIST:
            creator_info = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
            if creator_info:
                utils.logger.info(f"[TieBaCrawler.get_creators_and_notes] creator info: {creator_info}")
                # TiebaCreator model -> plain dict for the store layer
                await tieba_store.save_creator(creator_info.user_id, user_info=creator_info.model_dump())
                # Get all note information of the creator
                all_notes_list = await self.tieba_client.get_all_notes_by_creator_user_name(
                    user_name=creator_info.user_name,
                    crawl_interval=0,
                    callback=tieba_store.batch_update_tieba_notes
                )
                await self.batch_get_note_comments(all_notes_list)
            else:
                utils.logger.error(
                    f"[TieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}")
    async def launch_browser(
            self,
            chromium: BrowserType,
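
The new branch is selected purely by configuration. Below is a hypothetical driver for exercising it in isolation; the module path and the no-argument TieBaCrawler constructor are assumptions, not part of this commit:

    import asyncio

    import config
    from media_platform.tieba.core import TieBaCrawler  # assumed module path

    async def main():
        config.CRAWLER_TYPE = "creator"  # start() now routes to get_creators_and_notes()
        crawler = TieBaCrawler()         # assumed no-arg constructor
        await crawler.start()

    if __name__ == "__main__":
        asyncio.run(main())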

View File

@@ -3,13 +3,17 @@ import html
import json
import re
from typing import Dict, List, Tuple
from urllib.parse import unquote, parse_qs

from parsel import Selector

from constant import baidu_tieba as const
from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator
from tools import utils

GENDER_MALE = "sex_male"
GENDER_FMALE = "sex_fmale"


class TieBaExtractor:
    def __init__(self):
@@ -199,8 +203,37 @@ class TieBaExtractor:
        return comments

    def extract_creator_info(self, html_content: str) -> TiebaCreator:
        """
        Extract Tieba creator info from the creator's homepage HTML
        Args:
            html_content:
        Returns:
        """
        selector = Selector(text=html_content)
        user_link_selector = selector.xpath("//p[@class='space']/a")
        user_link: str = user_link_selector.xpath("./@href").get(default='')
        user_link_params: Dict = parse_qs(unquote(user_link))
        user_name = user_link_params.get("un")[0] if user_link_params.get("un") else ""
        user_id = user_link_params.get("id")[0] if user_link_params.get("id") else ""
        userinfo_userdata_selector = selector.xpath("//div[@class='userinfo_userdata']")
        creator = TiebaCreator(
            user_id=user_id,
            user_name=user_name,
            nickname=selector.xpath(".//a[@class='userinfo_username']/text()").get(default='').strip(),
            avatar=selector.xpath(".//div[@class='userinfo_left_head']//img/@src").get(default='').strip(),
            gender=self.extract_gender(userinfo_userdata_selector.get(default='')),
            ip_location=self.extract_ip(userinfo_userdata_selector.get(default='')),
            follows=0,
            fans=0,
            follow_tieba_list="",
            registration_duration="",
        )
        return creator
    def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
        """
        Extract the IP location and the publish time
        Args:
@@ -209,13 +242,37 @@ class TieBaExtractor:
        Returns:
        """
        pattern_pub_time = re.compile(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>')
        time_match = pattern_pub_time.search(html_content)
        pub_time = time_match.group(1) if time_match else ""
        return self.extract_ip(html_content), pub_time
    @staticmethod
    def extract_ip(html_content: str) -> str:
        """
        Extract the IP location
        Args:
            html_content:
        Returns:
        """
        pattern_ip = re.compile(r'IP属地:(\S+)</span>')
        ip_match = pattern_ip.search(html_content)
        ip = ip_match.group(1) if ip_match else ""
        return ip

    @staticmethod
    def extract_gender(html_content: str) -> str:
        """
        Extract the gender
        Args:
            html_content:
        Returns:
        """
        # Gender is inferred from the sex_* class names rendered in the
        # userinfo_userdata block; falls back to "" when neither marker is present.
        if GENDER_MALE in html_content:
            return "male"
        if GENDER_FMALE in html_content:
            return "female"
        return ""
    @staticmethod
    def extract_data_field_value(selector: Selector) -> Dict:
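
For reference, the creator id and user name that extract_creator_info reads live in the query string of the profile link on the homepage. A standalone sketch of that parsing step, using the creator URL from the config change above (urlparse is added here only to isolate the query string; the extractor itself applies parse_qs(unquote(...)) to the raw href it finds on the page):

    from urllib.parse import parse_qs, unquote, urlparse

    creator_url = "https://tieba.baidu.com/home/main/?id=tb.1.6a328702.02qx9GEBmrwqYDRyOgGKXQ&fr=frs"
    params = parse_qs(unquote(urlparse(creator_url).query))
    user_id = params.get("id", [""])[0]  # "tb.1.6a328702.02qx9GEBmrwqYDRyOgGKXQ"
    source = params.get("fr", [""])[0]   # "frs"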

View File

@@ -44,3 +44,20 @@ class TiebaComment(BaseModel):
    tieba_name: str = Field(..., description="Name of the tieba it belongs to")
    tieba_link: str = Field(..., description="Tieba link")


class TiebaCreator(BaseModel):
    """
    Baidu Tieba creator
    """
    user_id: str = Field(..., description="User ID")
    user_name: str = Field(..., description="User name")
    nickname: str = Field(..., description="User nickname")
    gender: str = Field(default="", description="User gender")
    avatar: str = Field(..., description="User avatar URL")
    ip_location: Optional[str] = Field(default="", description="IP location")
    follows: int = Field(default=0, description="Following count")
    fans: int = Field(default=0, description="Follower count")
    follow_tieba_list: str = Field(default="", description="Followed tieba list")
    registration_duration: str = Field(default="", description="Registration duration")
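
A short sketch of how the new model round-trips into the plain dict that the store layer consumes below. The field values are made up, and model_dump assumes pydantic v2 (use .dict() on v1):

    from model.m_baidu_tieba import TiebaCreator

    creator = TiebaCreator(
        user_id="tb.1.example",  # made-up values for illustration
        user_name="example_user",
        nickname="Example",
        avatar="https://example.invalid/avatar.jpg",
    )
    # Unset optional fields fall back to their defaults ("" / 0).
    print(creator.model_dump())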

View File

@@ -23,6 +23,19 @@ class TieBaStoreFactory:
"[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...") "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
return store_class() return store_class()
async def batch_update_tieba_notes(note_list: List[TiebaNote]):
    """
    Batch update tieba notes
    Args:
        note_list:
    Returns:
    """
    if not note_list:
        return

    for note_item in note_list:
        await update_tieba_note(note_item)
async def update_tieba_note(note_item: TiebaNote):
    """
@@ -71,3 +84,29 @@ async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment):
    save_comment_item.update({"last_modify_ts": utils.get_current_timestamp()})
    utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{save_comment_item}")
    await TieBaStoreFactory.create_store().store_comment(save_comment_item)
async def save_creator(user_id: str, user_info: Dict):
    """
    Save creator information to local storage
    Args:
        user_id:
        user_info:
    Returns:
    """
    local_db_item = {
        'user_id': user_id,
        'nickname': user_info.get('nickname'),
        'gender': user_info.get('gender', ''),
        'avatar': user_info.get('avatar'),
        'ip_location': user_info.get('ip_location', ''),
        'follows': user_info.get('follows', 0),
        'fans': user_info.get('fans', 0),
        'follow_tieba_list': user_info.get('follow_tieba_list', ''),
        'last_modify_ts': utils.get_current_timestamp(),
        'registration_duration': user_info.get('registration_duration', ''),
    }
    utils.logger.info(f"[store.tieba.save_creator] creator:{local_db_item}")
    await TieBaStoreFactory.create_store().store_creator(local_db_item)
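
Since save_creator resolves its backend through TieBaStoreFactory (csv, db, or json per config.SAVE_DATA_OPTION), it can be exercised on its own with a file-based backend. A minimal sketch with made-up creator data; the store import path is assumed to follow the project's existing store package layout:

    import asyncio

    from store import tieba as tieba_store  # assumed import path

    async def demo():
        # Made-up data in the shape produced by TiebaCreator.model_dump().
        await tieba_store.save_creator(
            user_id="tb.1.example",
            user_info={
                "nickname": "Example",
                "gender": "male",
                "avatar": "https://example.invalid/avatar.jpg",
                "ip_location": "广东",
                "follows": 12,
                "fans": 3,
                "follow_tieba_list": "",
                "registration_duration": "8年",
            },
        )

    asyncio.run(demo())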