fix: bili词云图修复

This commit is contained in:
程序员阿江(Relakkes)
2025-11-02 13:25:31 +08:00
parent 3f5925e326
commit 889fa01466
5 changed files with 79 additions and 4 deletions

14
main.py
View File

@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
from media_platform.zhihu import ZhihuCrawler
from tools.async_file_writer import AsyncFileWriter
from var import crawler_type_var
class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
await crawler.start()
# Generate wordcloud after crawling is complete
# Only for JSON save mode
if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
try:
file_writer = AsyncFileWriter(
platform=config.PLATFORM,
crawler_type=crawler_type_var.get()
)
await file_writer.generate_wordcloud_from_comments()
except Exception as e:
print(f"Error generating wordcloud: {e}")
def cleanup():
if crawler:

View File

@@ -37,7 +37,7 @@ class BiliCsvStoreImplement(AbstractStore):
def __init__(self):
    """Create the async file writer used by this CSV store.

    The writer is bound to the crawler type currently held in
    ``crawler_type_var`` and to the ``"bili"`` platform name.
    """
    # ``platform`` must be passed exactly once: repeating a keyword argument
    # in a call is a SyntaxError.
    # NOTE(review): "bili" (not "bilibili") presumably matches the platform
    # key used for the data/<platform>/ directory - confirm against
    # config.PLATFORM.
    self.file_writer = AsyncFileWriter(
        crawler_type=crawler_type_var.get(),
        platform="bili",
    )
async def store_content(self, content_item: Dict):
@@ -220,7 +220,7 @@ class BiliJsonStoreImplement(AbstractStore):
def __init__(self):
    """Create the async file writer used by this JSON store.

    The writer is bound to the crawler type currently held in
    ``crawler_type_var`` and to the ``"bili"`` platform name.
    """
    # ``platform`` must be passed exactly once: repeating a keyword argument
    # in a call is a SyntaxError.
    # NOTE(review): "bili" (not "bilibili") presumably matches the platform
    # key used for the data/<platform>/ directory - confirm against
    # config.PLATFORM.
    self.file_writer = AsyncFileWriter(
        crawler_type=crawler_type_var.get(),
        platform="bili",
    )
async def store_content(self, content_item: Dict):

View File

@@ -22,7 +22,7 @@ from tools import utils
class BilibiliVideo(AbstractStoreVideo):
video_store_path: str = "data/bilibili/videos"
video_store_path: str = "data/bili/videos"
async def store_video(self, video_content_item: Dict):
"""

View File

@@ -5,13 +5,16 @@ import os
import pathlib
from typing import Dict, List
import aiofiles
import config
from tools.utils import utils
from tools.words import AsyncWordCloudGenerator
class AsyncFileWriter:
def __init__(self, platform: str, crawler_type: str):
    """Set up a writer for one platform / crawler-type pair.

    Args:
        platform: Platform name; stored on the instance and used when
            building ``data/<platform>/...`` paths.
        crawler_type: Crawler type label; stored on the instance.
    """
    # Lock object kept on the instance for the writer's own use.
    self.lock = asyncio.Lock()
    self.platform, self.crawler_type = platform, crawler_type

    # The wordcloud generator is only instantiated when the feature flag is
    # on; otherwise the attribute is left as None.
    if config.ENABLE_GET_WORDCLOUD:
        self.wordcloud_generator = AsyncWordCloudGenerator()
    else:
        self.wordcloud_generator = None
def _get_file_path(self, file_type: str, item_type: str) -> str:
base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@ class AsyncFileWriter:
existing_data.append(item)
async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
async def generate_wordcloud_from_comments(self):
    """
    Build word-frequency output and a wordcloud from the stored JSON comments.

    The method is a no-op unless ``config.ENABLE_GET_WORDCLOUD`` and
    ``config.ENABLE_GET_COMMENTS`` are both truthy and this writer holds a
    wordcloud generator. It reads the comments JSON file resolved by
    ``_get_file_path('json', 'comments')``, keeps only the comment text, and
    hands the result to the generator.

    Any exception raised while reading, parsing or generating is logged via
    ``utils.logger.error`` and not re-raised.
    """
    features_on = config.ENABLE_GET_WORDCLOUD and config.ENABLE_GET_COMMENTS
    if not (features_on and self.wordcloud_generator):
        return

    try:
        # Locate the comments JSON file; bail out if it is absent or 0 bytes.
        source_path = self._get_file_path('json', 'comments')
        is_missing = not os.path.exists(source_path)
        if is_missing or os.path.getsize(source_path) == 0:
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {source_path}")
            return

        async with aiofiles.open(source_path, 'r', encoding='utf-8') as fp:
            raw_text = await fp.read()
            if not raw_text:
                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
                return

            parsed = json.loads(raw_text)
            # A single top-level JSON value is treated as a one-item list.
            records = parsed if isinstance(parsed, list) else [parsed]

        # Keep only the comment text. The text is looked up under several
        # candidate keys, in priority order, taking the first truthy value.
        text_keys = ('content', 'comment_text', 'text')
        word_source = []
        for record in records:
            if not isinstance(record, dict):
                continue
            text = next(filter(None, map(record.get, text_keys)), '')
            if text:
                word_source.append({'content': text})

        if not word_source:
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
            return

        # Make sure the output directory exists, then build the file prefix.
        words_base_path = f"data/{self.platform}/words"
        pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
        words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"

        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(word_source)} comments")
        await self.wordcloud_generator.generate_word_frequency_and_cloud(word_source, words_file_prefix)
        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")
    except Exception as e:
        # Best-effort feature: report the failure and carry on.
        utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")

View File

@@ -26,6 +26,10 @@ def init_loging_config():
)
_logger = logging.getLogger("MediaCrawler")
_logger.setLevel(level)
# 关闭 httpx 的 INFO 日志
logging.getLogger("httpx").setLevel(logging.WARNING)
return _logger