fix: bili词云图修复

2025-11-25 03:15:17 +08:00 · 2025-11-02 13:25:31 +08:00
parent 3f5925e326
commit 889fa01466
5 changed files with 79 additions and 4 deletions
--- a/main.py
+++ b/main.py
@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 from media_platform.zhihu import ZhihuCrawler
+from tools.async_file_writer import AsyncFileWriter
+from var import crawler_type_var


 class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
    await crawler.start()

+    # Generate wordcloud after crawling is complete
+    # Only for JSON save mode
+    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
+        try:
+            file_writer = AsyncFileWriter(
+                platform=config.PLATFORM,
+                crawler_type=crawler_type_var.get()
+            )
+            await file_writer.generate_wordcloud_from_comments()
+        except Exception as e:
+            print(f"Error generating wordcloud: {e}")
+

 def cleanup():
    if crawler:
--- a/store/bilibili/_store_impl.py
+++ b/store/bilibili/_store_impl.py
@@ -37,7 +37,7 @@ class BiliCsvStoreImplement(AbstractStore):
    def __init__(self):
        self.file_writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
        )

    async def store_content(self, content_item: Dict):
@@ -220,7 +220,7 @@ class BiliJsonStoreImplement(AbstractStore):
    def __init__(self):
        self.file_writer = AsyncFileWriter(
            crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
        )

    async def store_content(self, content_item: Dict):
--- a/store/bilibili/bilibilli_store_media.py
+++ b/store/bilibili/bilibilli_store_media.py
@@ -22,7 +22,7 @@ from tools import utils


 class BilibiliVideo(AbstractStoreVideo):
-    video_store_path: str = "data/bilibili/videos"
+    video_store_path: str = "data/bili/videos"

    async def store_video(self, video_content_item: Dict):
        """
--- a/tools/async_file_writer.py
+++ b/tools/async_file_writer.py
@@ -5,13 +5,16 @@ import os
 import pathlib
 from typing import Dict, List
 import aiofiles
+import config
 from tools.utils import utils
+from tools.words import AsyncWordCloudGenerator

 class AsyncFileWriter:
    def __init__(self, platform: str, crawler_type: str):
        self.lock = asyncio.Lock()
        self.platform = platform
        self.crawler_type = crawler_type
+        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None

    def _get_file_path(self, file_type: str, item_type: str) -> str:
        base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@ class AsyncFileWriter:
            existing_data.append(item)

            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
-                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
+                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
+
+    async def generate_wordcloud_from_comments(self):
+        """
+        Generate wordcloud from comments data
+        Only works when ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS are True
+        """
+        if not config.ENABLE_GET_WORDCLOUD or not config.ENABLE_GET_COMMENTS:
+            return
+
+        if not self.wordcloud_generator:
+            return
+
+        try:
+            # Read comments from JSON file
+            comments_file_path = self._get_file_path('json', 'comments')
+            if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
+                return
+
+            async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
+                content = await f.read()
+                if not content:
+                    utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
+                    return
+
+                comments_data = json.loads(content)
+                if not isinstance(comments_data, list):
+                    comments_data = [comments_data]
+
+            # Filter comments data to only include 'content' field
+            # Handle different comment data structures across platforms
+            filtered_data = []
+            for comment in comments_data:
+                if isinstance(comment, dict):
+                    # Try different possible content field names
+                    content_text = comment.get('content') or comment.get('comment_text') or comment.get('text') or ''
+                    if content_text:
+                        filtered_data.append({'content': content_text})
+
+            if not filtered_data:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
+                return
+
+            # Generate wordcloud
+            words_base_path = f"data/{self.platform}/words"
+            pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
+            words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
+
+            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
+            await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
+            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")
+
+        except Exception as e:
+            utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -26,6 +26,10 @@ def init_loging_config():
    )
    _logger = logging.getLogger("MediaCrawler")
    _logger.setLevel(level)
+
+    # 关闭 httpx 的 INFO 日志
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
    return _logger