diff --git a/main.py b/main.py
index a822e67..b4c55a1 100644
--- a/main.py
+++ b/main.py
@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 from media_platform.zhihu import ZhihuCrawler
+from tools.async_file_writer import AsyncFileWriter
+from var import crawler_type_var
 
 
 class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
     crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
     await crawler.start()
 
+    # Generate wordcloud after crawling is complete
+    # Only for JSON save mode
+    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
+        try:
+            file_writer = AsyncFileWriter(
+                platform=config.PLATFORM,
+                crawler_type=crawler_type_var.get()
+            )
+            await file_writer.generate_wordcloud_from_comments()
+        except Exception as e:
+            print(f"Error generating wordcloud: {e}")
+
 
 def cleanup():
     if crawler:
diff --git a/store/bilibili/_store_impl.py b/store/bilibili/_store_impl.py
index 9872ea3..7084748 100644
--- a/store/bilibili/_store_impl.py
+++ b/store/bilibili/_store_impl.py
@@ -37,7 +37,7 @@ class BiliCsvStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
         )
 
     async def store_content(self, content_item: Dict):
@@ -220,7 +220,7 @@ class BiliJsonStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
         )
 
     async def store_content(self, content_item: Dict):
diff --git a/store/bilibili/bilibilli_store_media.py b/store/bilibili/bilibilli_store_media.py
index 524e9fd..cacf8bf 100644
--- a/store/bilibili/bilibilli_store_media.py
+++ b/store/bilibili/bilibilli_store_media.py
@@ -22,7 +22,7 @@ from tools import utils
 
 
 class BilibiliVideo(AbstractStoreVideo):
-    video_store_path: str = "data/bilibili/videos"
+    video_store_path: str = "data/bili/videos"
 
     async def store_video(self, video_content_item: Dict):
         """
diff --git a/tools/async_file_writer.py b/tools/async_file_writer.py
index 972fff8..e133eee 100644
--- a/tools/async_file_writer.py
+++ b/tools/async_file_writer.py
@@ -5,13 +5,16 @@ import os
 import pathlib
 from typing import Dict, List
 import aiofiles
+import config
 from tools.utils import utils
+from tools.words import AsyncWordCloudGenerator
 
 class AsyncFileWriter:
     def __init__(self, platform: str, crawler_type: str):
         self.lock = asyncio.Lock()
         self.platform = platform
         self.crawler_type = crawler_type
+        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None
 
     def _get_file_path(self, file_type: str, item_type: str) -> str:
         base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@
             existing_data.append(item)
 
         async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
-            await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
\ No newline at end of file
+            await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
+
+    async def generate_wordcloud_from_comments(self):
+        """
+        Generate wordcloud from comments data
+        Only works when ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS are True
+        """
+        if not config.ENABLE_GET_WORDCLOUD or not config.ENABLE_GET_COMMENTS:
+            return
+
+        if not self.wordcloud_generator:
+            return
+
+        try:
+            # Read comments from JSON file
+            comments_file_path = self._get_file_path('json', 'comments')
+            if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
+                return
+
+            async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
+                content = await f.read()
+                if not content:
+                    utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
+                    return
+
+            comments_data = json.loads(content)
+            if not isinstance(comments_data, list):
+                comments_data = [comments_data]
+
+            # Filter comments data to only include 'content' field
+            # Handle different comment data structures across platforms
+            filtered_data = []
+            for comment in comments_data:
+                if isinstance(comment, dict):
+                    # Try different possible content field names
+                    content_text = comment.get('content') or comment.get('comment_text') or comment.get('text') or ''
+                    if content_text:
+                        filtered_data.append({'content': content_text})
+
+            if not filtered_data:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
+                return
+
+            # Generate wordcloud
+            words_base_path = f"data/{self.platform}/words"
+            pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
+            words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
+
+            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
+            await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
+            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")
+
+        except Exception as e:
+            utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")
\ No newline at end of file
diff --git a/tools/utils.py b/tools/utils.py
index 80f01e2..20c72c8 100644
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -26,6 +26,10 @@ def init_loging_config():
     )
     _logger = logging.getLogger("MediaCrawler")
     _logger.setLevel(level)
+
+    # 关闭 httpx 的 INFO 日志
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
     return _logger