mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 11:29:27 +08:00
temp commit
This commit is contained in:
@@ -101,6 +101,11 @@ TIEBA_NAME_LIST = [
|
|||||||
# "盗墓笔记"
|
# "盗墓笔记"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Home-page URLs of the Tieba creators to crawl; the crawler iterates this
# list when config.CRAWLER_TYPE is "creator".
TIEBA_CREATOR_URL_LIST = [
    "https://tieba.baidu.com/home/main/?id=tb.1.6a328702.02qx9GEBmrwqYDRyOgGKXQ&fr=frs",
    # ........................
]
|
||||||
|
|
||||||
# 指定小红书创作者ID列表
|
# 指定小红书创作者ID列表
|
||||||
XHS_CREATOR_ID_LIST = [
|
XHS_CREATOR_ID_LIST = [
|
||||||
"63e36c9a000000002703502b",
|
"63e36c9a000000002703502b",
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
|
|||||||
|
|
||||||
import config
|
import config
|
||||||
from base.base_crawler import AbstractApiClient
|
from base.base_crawler import AbstractApiClient
|
||||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator
|
||||||
from proxy.proxy_ip_pool import ProxyIpPool
|
from proxy.proxy_ip_pool import ProxyIpPool
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
@@ -272,8 +272,6 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||||||
current_page += 1
|
current_page += 1
|
||||||
return all_sub_comments
|
return all_sub_comments
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
|
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
|
||||||
"""
|
"""
|
||||||
根据贴吧名称获取帖子列表
|
根据贴吧名称获取帖子列表
|
||||||
@@ -287,3 +285,71 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||||||
uri = f"/f?kw={tieba_name}&pn={page_num}"
|
uri = f"/f?kw={tieba_name}&pn={page_num}"
|
||||||
page_content = await self.get(uri, return_ori_content=True)
|
page_content = await self.get(uri, return_ori_content=True)
|
||||||
return self._page_extractor.extract_tieba_note_list(page_content)
|
return self._page_extractor.extract_tieba_note_list(page_content)
|
||||||
|
|
||||||
|
async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
    """
    Download a creator's home page and parse it into creator information.

    Args:
        creator_url: URL of the creator's home page.

    Returns:
        Whatever the page extractor's ``extract_creator_info`` produces for
        the downloaded page content (annotated as ``TiebaCreator``).
    """
    # return_ori_content=True is forwarded to self.request; the result is
    # handed to the extractor as the page's HTML content.
    raw_page = await self.request(
        method="GET",
        url=creator_url,
        return_ori_content=True,
    )
    creator = self._page_extractor.extract_creator_info(raw_page)
    return creator
|
||||||
|
|
||||||
|
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
    """
    Fetch one page of a creator's thread list.

    Args:
        user_name: Creator's user name, sent as the ``un`` query parameter.
        page_number: Page index, sent as the ``pn`` query parameter.

    Returns:
        Whatever ``self.get`` returns for the ``/home/get/getthread`` endpoint
        (annotated as a dict).
    """
    # Plain literal: the previous f-string had no placeholders.
    uri = "/home/get/getthread"
    params = {
        "un": user_name,
        "pn": page_number,
        # NOTE(review): "id" carries an encoding name; this may have been
        # intended to be "ie" - confirm against the endpoint before changing.
        "id": "utf-8",
        # Current timestamp; presumably a cache buster - confirm.
        "_": utils.get_current_timestamp(),
    }
    return await self.get(uri, params=params)
|
||||||
|
|
||||||
|
async def get_all_notes_by_creator_user_name(self, user_name: str, crawl_interval: float = 1.0,
                                             callback: Optional[Callable] = None) -> List[TiebaNote]:
    """
    Page through a creator's thread list and fetch the detail of every thread.

    Paging stops when a response is empty, when its ``no`` field is not 0,
    or when its ``has_more`` field is no longer 1.

    Args:
        user_name: Creator's user name, forwarded to ``get_notes_by_creator``.
        crawl_interval: Seconds to sleep after each page has been processed.
        callback: Optional coroutine function awaited with the list of note
            details fetched for each page.

    Returns:
        All results of ``get_note_by_id`` collected over every page
        (annotated as a list of ``TiebaNote``).
    """
    # The previous messages were tagged "WeiboClient" (copy-paste leftover),
    # which made log lines point at the wrong platform client.
    log_tag = "[BaiduTieBaClient.get_all_notes_by_creator_user_name]"
    result = []
    notes_has_more = 1
    page_number = 1
    while notes_has_more == 1:
        notes_res = await self.get_notes_by_creator(user_name, page_number)
        if not notes_res or notes_res.get("no") != 0:
            utils.logger.error(
                f"{log_tag} got user_name:{user_name} notes failed, notes_res: {notes_res}")
            break

        notes_has_more = notes_res.get("has_more")
        page_number += 1
        # A successful response without "thread_list" is treated as an empty
        # page instead of raising KeyError.
        thread_list = notes_res.get("thread_list") or []
        utils.logger.info(
            f"{log_tag} got user_name:{user_name} notes len : {len(thread_list)}")

        # Fetch the details of all threads on this page concurrently.
        note_detail_task = [self.get_note_by_id(note['thread_id']) for note in thread_list]
        notes = await asyncio.gather(*note_detail_task)
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        result.extend(notes)
    return result
|
||||||
|
|
||||||
|
|||||||
@@ -57,6 +57,9 @@ class TieBaCrawler(AbstractCrawler):
|
|||||||
elif config.CRAWLER_TYPE == "detail":
|
elif config.CRAWLER_TYPE == "detail":
|
||||||
# Get the information and comments of the specified post
|
# Get the information and comments of the specified post
|
||||||
await self.get_specified_notes()
|
await self.get_specified_notes()
|
||||||
|
elif config.CRAWLER_TYPE == "creator":
|
||||||
|
# Get creator's information and their notes and comments
|
||||||
|
await self.get_creators_and_notes()
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -215,6 +218,38 @@ class TieBaCrawler(AbstractCrawler):
|
|||||||
callback=tieba_store.batch_update_tieba_note_comments
|
callback=tieba_store.batch_update_tieba_note_comments
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def get_creators_and_notes(self) -> None:
    """
    Crawl every creator listed in ``config.TIEBA_CREATOR_URL_LIST``.

    For each URL the creator information is fetched and saved, then all of
    the creator's notes are fetched (each page is stored through
    ``tieba_store.batch_update_tieba_notes``) and finally the comments of
    those notes are crawled. A creator whose information cannot be fetched
    is logged and skipped.

    Returns:
        None
    """
    # The previous messages were tagged "WeiboCrawler" (copy-paste leftover).
    log_tag = "[TieBaCrawler.get_creators_and_notes]"
    utils.logger.info(f"{log_tag} Begin get tieba creators")
    for creator_url in config.TIEBA_CREATOR_URL_LIST:
        creator_info = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
        if not creator_info:
            utils.logger.error(f"{log_tag} get creator info error, creator_url:{creator_url}")
            continue

        utils.logger.info(f"{log_tag} creator info: {creator_info}")

        # The client is annotated to return a TiebaCreator model, which has no
        # dict-style .get(); convert it to a plain dict before reading fields.
        # model_dump() is the pydantic v2 spelling, dict() the v1 spelling.
        if isinstance(creator_info, dict):
            creator_fields = creator_info
        elif hasattr(creator_info, "model_dump"):
            creator_fields = creator_info.model_dump()
        else:
            creator_fields = creator_info.dict()

        await tieba_store.save_creator(creator_fields.get("user_id"), user_info=creator_fields)

        # Get all note information of the creator
        all_notes_list = await self.tieba_client.get_all_notes_by_creator_user_name(
            user_name=creator_fields.get("user_name"),
            crawl_interval=0,
            callback=tieba_store.batch_update_tieba_notes
        )

        await self.batch_get_note_comments(all_notes_list)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def launch_browser(
|
async def launch_browser(
|
||||||
self,
|
self,
|
||||||
chromium: BrowserType,
|
chromium: BrowserType,
|
||||||
|
|||||||
@@ -3,13 +3,17 @@ import html
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
from urllib.parse import unquote, parse_qs
|
||||||
|
|
||||||
from parsel import Selector
|
from parsel import Selector
|
||||||
|
|
||||||
from constant import baidu_tieba as const
|
from constant import baidu_tieba as const
|
||||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
|
# Marker strings used to recognise a user's gender in profile markup
# (presumably CSS class names on the Tieba home page - confirm).
GENDER_MALE = "sex_male"
# NOTE(review): "FMALE" looks like a typo for "FEMALE"; the name is kept
# because other code may reference it.
GENDER_FMALE = "sex_fmale"
|
||||||
|
|
||||||
|
|
||||||
class TieBaExtractor:
|
class TieBaExtractor:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -199,8 +203,37 @@ class TieBaExtractor:
|
|||||||
|
|
||||||
return comments
|
return comments
|
||||||
|
|
||||||
@staticmethod
|
def extract_creator_info(self, html_content: str) -> TiebaCreator:
    """
    Extract Tieba creator information from a creator home page.

    The user name and user id are read from the ``un`` and ``id`` query
    parameters of the first link found under ``<p class="space">``.
    Follow/fan counts, followed-tieba list and registration duration are
    not parsed here and are filled with placeholder values.

    Args:
        html_content: HTML of the creator's home page.

    Returns:
        A ``TiebaCreator`` built from the page.
    """
    selector = Selector(text=html_content)
    user_link_selector = selector.xpath("//p[@class='space']/a")
    user_link: str = user_link_selector.xpath("./@href").get(default='')

    # Parse only the part after "?": feeding the whole href to parse_qs fuses
    # the path with the first parameter name (e.g. "/home/main?un"), so the
    # "un"/"id" lookup silently fails for whichever parameter comes first.
    query_string = user_link.split("?", 1)[-1]
    user_link_params: Dict = parse_qs(unquote(query_string))
    user_name = user_link_params.get("un")[0] if user_link_params.get("un") else ""
    user_id = user_link_params.get("id")[0] if user_link_params.get("id") else ""

    # Serialise the user-data node once and reuse it for both extractors.
    userdata_html = selector.xpath("//div[@class='userinfo_userdata']").get(default='')

    creator = TiebaCreator(
        user_id=user_id,
        user_name=user_name,
        nickname=selector.xpath(".//a[@class='userinfo_username']/text()").get(default='').strip(),
        avatar=selector.xpath(".//div[@class='userinfo_left_head']//img/@src").get(default='').strip(),
        # Fall back to "" so a missing gender never passes None into the
        # model's str field.
        gender=self.extract_gender(userdata_html) or "",
        ip_location=self.extract_ip(userdata_html),
        follows=0,
        fans=0,
        follow_tieba_list="",
        registration_duration="",
    )
    return creator
|
||||||
|
|
||||||
|
def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
提取IP位置和发布时间
|
提取IP位置和发布时间
|
||||||
Args:
|
Args:
|
||||||
@@ -209,13 +242,37 @@ class TieBaExtractor:
|
|||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pattern_ip = re.compile(r'IP属地:(\S+)</span>')
|
|
||||||
pattern_pub_time = re.compile(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>')
|
pattern_pub_time = re.compile(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>')
|
||||||
ip_match = pattern_ip.search(html_content)
|
|
||||||
time_match = pattern_pub_time.search(html_content)
|
time_match = pattern_pub_time.search(html_content)
|
||||||
ip = ip_match.group(1) if ip_match else ""
|
|
||||||
pub_time = time_match.group(1) if time_match else ""
|
pub_time = time_match.group(1) if time_match else ""
|
||||||
return ip, pub_time
|
return self.extract_ip(html_content), pub_time
|
||||||
|
|
||||||
|
@staticmethod
def extract_ip(html_content: str) -> str:
    """
    Extract the IP location text from an HTML fragment.

    Looks for ``IP属地:<value></span>`` and returns ``<value>``.

    Args:
        html_content: HTML text to search; may be empty.

    Returns:
        The IP location text, or "" when the input is empty or no match
        is found.
    """
    if not html_content:
        return ""
    # [^<\s]+ stops at the first tag boundary. The previous greedy \S+ could
    # run past the closing tag and capture markup up to a later </span> when
    # no whitespace separated the elements.
    ip_match = re.search(r'IP属地:([^<\s]+)</span>', html_content)
    return ip_match.group(1) if ip_match else ""
|
||||||
|
|
||||||
|
@staticmethod
def extract_gender(html_content: str) -> str:
    """
    Extract the user's gender from an HTML fragment.

    The fragment is searched for the module-level marker strings
    ``GENDER_MALE`` and ``GENDER_FMALE``.

    Args:
        html_content: HTML text to search; may be empty.

    Returns:
        "m" when the male marker is present, "f" when the female marker is
        present, and "" when the input is empty or neither marker is found.
        "f" is the value the store's ``save_creator`` compares against.
    """
    # The previous body was a bare `pass`, so callers received None despite
    # the `-> str` annotation.
    if not html_content:
        return ""
    if GENDER_MALE in html_content:
        return "m"
    if GENDER_FMALE in html_content:
        return "f"
    return ""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_data_field_value(selector: Selector) -> Dict:
|
def extract_data_field_value(selector: Selector) -> Dict:
|
||||||
|
|||||||
@@ -44,3 +44,20 @@ class TiebaComment(BaseModel):
|
|||||||
tieba_name: str = Field(..., description="所属的贴吧名称")
|
tieba_name: str = Field(..., description="所属的贴吧名称")
|
||||||
tieba_link: str = Field(..., description="贴吧链接")
|
tieba_link: str = Field(..., description="贴吧链接")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TiebaCreator(BaseModel):
    """
    Baidu Tieba creator (user profile) record.
    """

    # --- identity ---
    user_id: str = Field(..., description="用户ID")
    user_name: str = Field(..., description="用户名")
    nickname: str = Field(..., description="用户昵称")

    # --- profile details ---
    gender: str = Field(default="", description="用户性别")
    avatar: str = Field(..., description="用户头像地址")
    ip_location: Optional[str] = Field(default="", description="IP地理位置")

    # --- social statistics ---
    follows: int = Field(default=0, description="关注数")
    fans: int = Field(default=0, description="粉丝数")
    follow_tieba_list: str = Field(default="", description="关注的贴吧列表")
    registration_duration: str = Field(default="", description="注册时长")
|
||||||
|
|
||||||
|
|||||||
@@ -23,6 +23,19 @@ class TieBaStoreFactory:
|
|||||||
"[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
|
"[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
|
||||||
return store_class()
|
return store_class()
|
||||||
|
|
||||||
|
async def batch_update_tieba_notes(note_list: List[TiebaNote]):
    """
    Store a batch of Tieba notes one after another.

    Args:
        note_list: Notes to store; an empty or falsy value is a no-op.

    Returns:
        None
    """
    # `or []` keeps the early-exit behaviour for empty / None input.
    for current_note in note_list or []:
        await update_tieba_note(current_note)
|
||||||
|
|
||||||
async def update_tieba_note(note_item: TiebaNote):
|
async def update_tieba_note(note_item: TiebaNote):
|
||||||
"""
|
"""
|
||||||
@@ -71,3 +84,29 @@ async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment):
|
|||||||
save_comment_item.update({"last_modify_ts": utils.get_current_timestamp()})
|
save_comment_item.update({"last_modify_ts": utils.get_current_timestamp()})
|
||||||
utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{save_comment_item}")
|
utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{save_comment_item}")
|
||||||
await TieBaStoreFactory.create_store().store_comment(save_comment_item)
|
await TieBaStoreFactory.create_store().store_comment(save_comment_item)
|
||||||
|
|
||||||
|
|
||||||
|
async def save_creator(user_id: str, user_info: Dict):
    """
    Save creator information through the configured store.

    Args:
        user_id: Creator's user id.
        user_info: Creator fields as a dict. The field names of the
            ``TiebaCreator`` model (``follows``, ``fans``,
            ``follow_tieba_list``) are read first; the keys used previously
            (``follow_count``, ``followers_count``, ``tieba_list``) are still
            accepted as a fallback.

    Returns:
        None
    """

    def first_present(*keys):
        # Return the value of the first key present in user_info, else ''.
        for key in keys:
            if key in user_info:
                return user_info[key]
        return ''

    local_db_item = {
        'user_id': user_id,
        'nickname': user_info.get('nickname'),
        # NOTE(review): any value other than "f" (including unknown) is
        # stored as male, as before - confirm this is intended.
        'gender': '女' if user_info.get('gender') == "f" else '男',
        'avatar': user_info.get('avatar'),
        'ip_location': user_info.get("ip_location", ""),
        # The previous keys never matched the TiebaCreator field names, so
        # these three values were always stored as ''.
        'follows': first_present('follows', 'follow_count'),
        'fans': first_present('fans', 'followers_count'),
        'follow_tieba_list': first_present('follow_tieba_list', 'tieba_list'),
        'last_modify_ts': utils.get_current_timestamp(),
        'registration_duration': user_info.get("registration_duration", ""),
    }
    utils.logger.info(f"[store.tieba.save_creator] creator:{local_db_item}")
    await TieBaStoreFactory.create_store().store_creator(local_db_item)
|
||||||
Reference in New Issue
Block a user