mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2025-11-25 03:15:17 +08:00)
fix: repair bili wordcloud generation
main.py | 14
@@ -24,6 +24,8 @@ from media_platform.tieba import TieBaCrawler
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 from media_platform.zhihu import ZhihuCrawler
+from tools.async_file_writer import AsyncFileWriter
+from var import crawler_type_var
 
 
 class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
     crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
     await crawler.start()
 
+    # Generate wordcloud after crawling is complete
+    # Only for JSON save mode
+    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
+        try:
+            file_writer = AsyncFileWriter(
+                platform=config.PLATFORM,
+                crawler_type=crawler_type_var.get()
+            )
+            await file_writer.generate_wordcloud_from_comments()
+        except Exception as e:
+            print(f"Error generating wordcloud: {e}")
+
 
 def cleanup():
     if crawler:
@@ -37,7 +37,7 @@ class BiliCsvStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
         )
 
     async def store_content(self, content_item: Dict):
@@ -220,7 +220,7 @@ class BiliJsonStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
        )
 
     async def store_content(self, content_item: Dict):
@@ -22,7 +22,7 @@ from tools import utils
 
 
 class BilibiliVideo(AbstractStoreVideo):
-    video_store_path: str = "data/bilibili/videos"
+    video_store_path: str = "data/bili/videos"
 
     async def store_video(self, video_content_item: Dict):
         """
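The "bilibili" to "bili" renames above are the core of the fix: the wordcloud step added to main.py builds its paths from config.PLATFORM, while the Bilibili store classes previously hard-coded "bilibili", so the two sides never agreed on a directory. A minimal sketch of the mismatch, assuming config.PLATFORM is "bili" for Bilibili runs (the diff implies this but does not show config.py) and using a hypothetical comments_dir helper that only loosely mirrors AsyncFileWriter._get_file_path:

def comments_dir(platform: str) -> str:
    # Loosely mirrors AsyncFileWriter._get_file_path: f"data/{self.platform}/{file_type}"
    return f"data/{platform}/json"

# Before the fix, comments were written under the hard-coded store name...
written_under = comments_dir("bilibili")   # -> "data/bilibili/json"
# ...but generate_wordcloud_from_comments looked under config.PLATFORM ("bili" assumed here),
# found no comments file, and quietly produced no wordcloud.
searched_under = comments_dir("bili")      # -> "data/bili/json"
assert written_under != searched_under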
@@ -5,13 +5,16 @@ import os
+import pathlib
 from typing import Dict, List
 import aiofiles
 import config
 from tools.utils import utils
+from tools.words import AsyncWordCloudGenerator
 
 class AsyncFileWriter:
     def __init__(self, platform: str, crawler_type: str):
         self.lock = asyncio.Lock()
         self.platform = platform
         self.crawler_type = crawler_type
+        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None
 
     def _get_file_path(self, file_type: str, item_type: str) -> str:
         base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@ class AsyncFileWriter:
             existing_data.append(item)
 
             async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
-                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
+                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
+
+    async def generate_wordcloud_from_comments(self):
+        """
+        Generate wordcloud from comments data
+        Only works when ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS are True
+        """
+        if not config.ENABLE_GET_WORDCLOUD or not config.ENABLE_GET_COMMENTS:
+            return
+
+        if not self.wordcloud_generator:
+            return
+
+        try:
+            # Read comments from JSON file
+            comments_file_path = self._get_file_path('json', 'comments')
+            if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
+                return
+
+            async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
+                content = await f.read()
+                if not content:
+                    utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
+                    return
+
+                comments_data = json.loads(content)
+                if not isinstance(comments_data, list):
+                    comments_data = [comments_data]
+
+                # Filter comments data to only include 'content' field
+                # Handle different comment data structures across platforms
+                filtered_data = []
+                for comment in comments_data:
+                    if isinstance(comment, dict):
+                        # Try different possible content field names
+                        content_text = comment.get('content') or comment.get('comment_text') or comment.get('text') or ''
+                        if content_text:
+                            filtered_data.append({'content': content_text})
+
+                if not filtered_data:
+                    utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
+                    return
+
+                # Generate wordcloud
+                words_base_path = f"data/{self.platform}/words"
+                pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
+                words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
+
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
+                await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")
+
+        except Exception as e:
+            utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")
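For reference, the new method can also be driven on its own, for example to rebuild a wordcloud from an already-crawled comments JSON without re-running the crawler. A hedged sketch, assuming ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS are enabled in config and that "search" matches the crawler type used when the comments were collected:

import asyncio

from tools.async_file_writer import AsyncFileWriter


async def regenerate_bili_wordcloud() -> None:
    # Constructor arguments as in the diff above; "bili" must match the directory
    # the Bilibili stores now write to, and "search" is an assumed crawler type.
    writer = AsyncFileWriter(platform="bili", crawler_type="search")
    await writer.generate_wordcloud_from_comments()


if __name__ == "__main__":
    asyncio.run(regenerate_bili_wordcloud())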
@@ -26,6 +26,10 @@ def init_loging_config():
     )
     _logger = logging.getLogger("MediaCrawler")
     _logger.setLevel(level)
+
+    # Silence httpx's INFO logs
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
     return _logger
 
 
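Unrelated to the wordcloud itself, the hunk above only demotes httpx's per-request INFO lines so they stop flooding the crawler output; a standalone equivalent using just the standard library:

import logging

# httpx logs an INFO line for every request it sends; raising its logger's
# threshold to WARNING hides those lines without touching other loggers.
logging.getLogger("httpx").setLevel(logging.WARNING)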