mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2025-11-25 11:29:27 +08:00
优化mongodb配置获取逻辑,移动存储基类位置。
This commit is contained in:
@@ -1,8 +1,5 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""MongoDB存储基类:提供连接管理和通用存储方法"""
|
||||||
MongoDB存储基类
|
|
||||||
提供MongoDB连接管理和通用存储方法
|
|
||||||
"""
|
|
||||||
import asyncio
|
import asyncio
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional
|
||||||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection
|
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection
|
||||||
@@ -11,7 +8,7 @@ from tools import utils
|
|||||||
|
|
||||||
|
|
||||||
class MongoDBConnection:
|
class MongoDBConnection:
|
||||||
"""MongoDB连接管理单例类"""
|
"""MongoDB连接管理(单例模式)"""
|
||||||
_instance = None
|
_instance = None
|
||||||
_client: Optional[AsyncIOMotorClient] = None
|
_client: Optional[AsyncIOMotorClient] = None
|
||||||
_db: Optional[AsyncIOMotorDatabase] = None
|
_db: Optional[AsyncIOMotorDatabase] = None
|
||||||
@@ -23,7 +20,7 @@ class MongoDBConnection:
|
|||||||
return cls._instance
|
return cls._instance
|
||||||
|
|
||||||
async def get_client(self) -> AsyncIOMotorClient:
|
async def get_client(self) -> AsyncIOMotorClient:
|
||||||
"""获取MongoDB客户端"""
|
"""获取客户端"""
|
||||||
if self._client is None:
|
if self._client is None:
|
||||||
async with self._lock:
|
async with self._lock:
|
||||||
if self._client is None:
|
if self._client is None:
|
||||||
@@ -31,7 +28,7 @@ class MongoDBConnection:
|
|||||||
return self._client
|
return self._client
|
||||||
|
|
||||||
async def get_db(self) -> AsyncIOMotorDatabase:
|
async def get_db(self) -> AsyncIOMotorDatabase:
|
||||||
"""获取MongoDB数据库"""
|
"""获取数据库"""
|
||||||
if self._db is None:
|
if self._db is None:
|
||||||
async with self._lock:
|
async with self._lock:
|
||||||
if self._db is None:
|
if self._db is None:
|
||||||
@@ -39,135 +36,93 @@ class MongoDBConnection:
|
|||||||
return self._db
|
return self._db
|
||||||
|
|
||||||
async def _connect(self):
|
async def _connect(self):
|
||||||
"""建立MongoDB连接"""
|
"""建立连接"""
|
||||||
try:
|
try:
|
||||||
mongo_config = db_config.mongodb_config
|
mongo_config = db_config.mongodb_config
|
||||||
host = mongo_config.get("host", "localhost")
|
host = mongo_config["host"]
|
||||||
port = mongo_config.get("port", 27017)
|
port = mongo_config["port"]
|
||||||
user = mongo_config.get("user", "")
|
user = mongo_config["user"]
|
||||||
password = mongo_config.get("password", "")
|
password = mongo_config["password"]
|
||||||
db_name = mongo_config.get("db_name", "media_crawler")
|
db_name = mongo_config["db_name"]
|
||||||
|
|
||||||
# 构建连接URL
|
# 构建连接URL(有认证/无认证)
|
||||||
if user and password:
|
if user and password:
|
||||||
connection_url = f"mongodb://{user}:{password}@{host}:{port}/"
|
connection_url = f"mongodb://{user}:{password}@{host}:{port}/"
|
||||||
else:
|
else:
|
||||||
connection_url = f"mongodb://{host}:{port}/"
|
connection_url = f"mongodb://{host}:{port}/"
|
||||||
|
|
||||||
self._client = AsyncIOMotorClient(connection_url, serverSelectionTimeoutMS=5000)
|
self._client = AsyncIOMotorClient(connection_url, serverSelectionTimeoutMS=5000)
|
||||||
# 测试连接
|
await self._client.server_info() # 测试连接
|
||||||
await self._client.server_info()
|
|
||||||
self._db = self._client[db_name]
|
self._db = self._client[db_name]
|
||||||
utils.logger.info(f"[MongoDBConnection] Successfully connected to MongoDB at {host}:{port}, database: {db_name}")
|
utils.logger.info(f"[MongoDBConnection] Connected to {host}:{port}/{db_name}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[MongoDBConnection] Failed to connect to MongoDB: {e}")
|
utils.logger.error(f"[MongoDBConnection] Connection failed: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
"""关闭MongoDB连接"""
|
"""关闭连接"""
|
||||||
if self._client is not None:
|
if self._client is not None:
|
||||||
self._client.close()
|
self._client.close()
|
||||||
self._client = None
|
self._client = None
|
||||||
self._db = None
|
self._db = None
|
||||||
utils.logger.info("[MongoDBConnection] MongoDB connection closed")
|
utils.logger.info("[MongoDBConnection] Connection closed")
|
||||||
|
|
||||||
|
|
||||||
class MongoDBStoreBase:
|
class MongoDBStoreBase:
|
||||||
"""MongoDB存储基类"""
|
"""MongoDB存储基类:提供通用的CRUD操作"""
|
||||||
|
|
||||||
def __init__(self, collection_prefix: str):
|
def __init__(self, collection_prefix: str):
|
||||||
"""
|
"""初始化存储基类
|
||||||
初始化MongoDB存储基类
|
|
||||||
Args:
|
Args:
|
||||||
collection_prefix: 集合名称前缀(如:xhs, douyin, bilibili等)
|
collection_prefix: 平台前缀(xhs/douyin/bilibili等)
|
||||||
"""
|
"""
|
||||||
self.collection_prefix = collection_prefix
|
self.collection_prefix = collection_prefix
|
||||||
self._connection = MongoDBConnection()
|
self._connection = MongoDBConnection()
|
||||||
|
|
||||||
async def get_collection(self, collection_suffix: str) -> AsyncIOMotorCollection:
|
async def get_collection(self, collection_suffix: str) -> AsyncIOMotorCollection:
|
||||||
"""
|
"""获取集合:{prefix}_{suffix}"""
|
||||||
获取MongoDB集合
|
|
||||||
Args:
|
|
||||||
collection_suffix: 集合名称后缀(如:contents, comments, creators)
|
|
||||||
Returns:
|
|
||||||
MongoDB集合对象
|
|
||||||
"""
|
|
||||||
db = await self._connection.get_db()
|
db = await self._connection.get_db()
|
||||||
collection_name = f"{self.collection_prefix}_{collection_suffix}"
|
collection_name = f"{self.collection_prefix}_{collection_suffix}"
|
||||||
return db[collection_name]
|
return db[collection_name]
|
||||||
|
|
||||||
async def save_or_update(self, collection_suffix: str, query: Dict, data: Dict) -> bool:
|
async def save_or_update(self, collection_suffix: str, query: Dict, data: Dict) -> bool:
|
||||||
"""
|
"""保存或更新数据(upsert)"""
|
||||||
保存或更新数据(upsert操作)
|
|
||||||
Args:
|
|
||||||
collection_suffix: 集合名称后缀
|
|
||||||
query: 查询条件
|
|
||||||
data: 要保存的数据
|
|
||||||
Returns:
|
|
||||||
是否成功
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
collection = await self.get_collection(collection_suffix)
|
collection = await self.get_collection(collection_suffix)
|
||||||
result = await collection.update_one(
|
await collection.update_one(query, {"$set": data}, upsert=True)
|
||||||
query,
|
|
||||||
{"$set": data},
|
|
||||||
upsert=True
|
|
||||||
)
|
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[MongoDBStoreBase.save_or_update] Failed to save data to {self.collection_prefix}_{collection_suffix}: {e}")
|
utils.logger.error(f"[MongoDBStoreBase] Save failed ({self.collection_prefix}_{collection_suffix}): {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
async def find_one(self, collection_suffix: str, query: Dict) -> Optional[Dict]:
|
async def find_one(self, collection_suffix: str, query: Dict) -> Optional[Dict]:
|
||||||
"""
|
"""查询单条数据"""
|
||||||
查询单条数据
|
|
||||||
Args:
|
|
||||||
collection_suffix: 集合名称后缀
|
|
||||||
query: 查询条件
|
|
||||||
Returns:
|
|
||||||
查询结果
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
collection = await self.get_collection(collection_suffix)
|
collection = await self.get_collection(collection_suffix)
|
||||||
result = await collection.find_one(query)
|
return await collection.find_one(query)
|
||||||
return result
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[MongoDBStoreBase.find_one] Failed to query from {self.collection_prefix}_{collection_suffix}: {e}")
|
utils.logger.error(f"[MongoDBStoreBase] Find one failed ({self.collection_prefix}_{collection_suffix}): {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def find_many(self, collection_suffix: str, query: Dict, limit: int = 0) -> List[Dict]:
|
async def find_many(self, collection_suffix: str, query: Dict, limit: int = 0) -> List[Dict]:
|
||||||
"""
|
"""查询多条数据(limit=0表示不限制)"""
|
||||||
查询多条数据
|
|
||||||
Args:
|
|
||||||
collection_suffix: 集合名称后缀
|
|
||||||
query: 查询条件
|
|
||||||
limit: 限制返回数量,0表示不限制
|
|
||||||
Returns:
|
|
||||||
查询结果列表
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
collection = await self.get_collection(collection_suffix)
|
collection = await self.get_collection(collection_suffix)
|
||||||
cursor = collection.find(query)
|
cursor = collection.find(query)
|
||||||
if limit > 0:
|
if limit > 0:
|
||||||
cursor = cursor.limit(limit)
|
cursor = cursor.limit(limit)
|
||||||
results = await cursor.to_list(length=None)
|
return await cursor.to_list(length=None)
|
||||||
return results
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[MongoDBStoreBase.find_many] Failed to query from {self.collection_prefix}_{collection_suffix}: {e}")
|
utils.logger.error(f"[MongoDBStoreBase] Find many failed ({self.collection_prefix}_{collection_suffix}): {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
async def create_index(self, collection_suffix: str, keys: List[tuple], unique: bool = False):
|
async def create_index(self, collection_suffix: str, keys: List[tuple], unique: bool = False):
|
||||||
"""
|
"""创建索引:keys=[("field", 1)]"""
|
||||||
创建索引
|
|
||||||
Args:
|
|
||||||
collection_suffix: 集合名称后缀
|
|
||||||
keys: 索引键列表,例如:[("note_id", 1)]
|
|
||||||
unique: 是否创建唯一索引
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
collection = await self.get_collection(collection_suffix)
|
collection = await self.get_collection(collection_suffix)
|
||||||
await collection.create_index(keys, unique=unique)
|
await collection.create_index(keys, unique=unique)
|
||||||
utils.logger.info(f"[MongoDBStoreBase.create_index] Created index on {self.collection_prefix}_{collection_suffix}")
|
utils.logger.info(f"[MongoDBStoreBase] Index created on {self.collection_prefix}_{collection_suffix}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[MongoDBStoreBase.create_index] Failed to create index: {e}")
|
utils.logger.error(f"[MongoDBStoreBase] Create index failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
@@ -31,7 +31,7 @@ from database.models import BilibiliVideoComment, BilibiliVideo, BilibiliUpInfo,
|
|||||||
from tools.async_file_writer import AsyncFileWriter
|
from tools.async_file_writer import AsyncFileWriter
|
||||||
from tools import utils, words
|
from tools import utils, words
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
from store.mongodb_store_base import MongoDBStoreBase
|
from database.mongodb_store_base import MongoDBStoreBase
|
||||||
|
|
||||||
|
|
||||||
class BiliCsvStoreImplement(AbstractStore):
|
class BiliCsvStoreImplement(AbstractStore):
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ from database.models import DouyinAweme, DouyinAwemeComment, DyCreator
|
|||||||
from tools import utils, words
|
from tools import utils, words
|
||||||
from tools.async_file_writer import AsyncFileWriter
|
from tools.async_file_writer import AsyncFileWriter
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
from store.mongodb_store_base import MongoDBStoreBase
|
from database.mongodb_store_base import MongoDBStoreBase
|
||||||
|
|
||||||
|
|
||||||
class DouyinCsvStoreImplement(AbstractStore):
|
class DouyinCsvStoreImplement(AbstractStore):
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ from database.db_session import get_session
|
|||||||
from database.models import KuaishouVideo, KuaishouVideoComment
|
from database.models import KuaishouVideo, KuaishouVideoComment
|
||||||
from tools import utils, words
|
from tools import utils, words
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
from store.mongodb_store_base import MongoDBStoreBase
|
from database.mongodb_store_base import MongoDBStoreBase
|
||||||
|
|
||||||
|
|
||||||
def calculate_number_of_files(file_store_path: str) -> int:
|
def calculate_number_of_files(file_store_path: str) -> int:
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from tools import utils, words
|
|||||||
from database.db_session import get_session
|
from database.db_session import get_session
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
from tools.async_file_writer import AsyncFileWriter
|
from tools.async_file_writer import AsyncFileWriter
|
||||||
from store.mongodb_store_base import MongoDBStoreBase
|
from database.mongodb_store_base import MongoDBStoreBase
|
||||||
|
|
||||||
|
|
||||||
def calculate_number_of_files(file_store_path: str) -> int:
|
def calculate_number_of_files(file_store_path: str) -> int:
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from tools import utils, words
|
|||||||
from tools.async_file_writer import AsyncFileWriter
|
from tools.async_file_writer import AsyncFileWriter
|
||||||
from database.db_session import get_session
|
from database.db_session import get_session
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
from store.mongodb_store_base import MongoDBStoreBase
|
from database.mongodb_store_base import MongoDBStoreBase
|
||||||
|
|
||||||
|
|
||||||
def calculate_number_of_files(file_store_path: str) -> int:
|
def calculate_number_of_files(file_store_path: str) -> int:
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from database.models import XhsNote, XhsNoteComment, XhsCreator
|
|||||||
from tools.async_file_writer import AsyncFileWriter
|
from tools.async_file_writer import AsyncFileWriter
|
||||||
from tools.time_util import get_current_timestamp
|
from tools.time_util import get_current_timestamp
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
from store.mongodb_store_base import MongoDBStoreBase
|
from database.mongodb_store_base import MongoDBStoreBase
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
class XhsCsvStoreImplement(AbstractStore):
|
class XhsCsvStoreImplement(AbstractStore):
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from database.models import ZhihuContent, ZhihuComment, ZhihuCreator
|
|||||||
from tools import utils, words
|
from tools import utils, words
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
from tools.async_file_writer import AsyncFileWriter
|
from tools.async_file_writer import AsyncFileWriter
|
||||||
from store.mongodb_store_base import MongoDBStoreBase
|
from database.mongodb_store_base import MongoDBStoreBase
|
||||||
|
|
||||||
def calculate_number_of_files(file_store_path: str) -> int:
|
def calculate_number_of_files(file_store_path: str) -> int:
|
||||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||||
|
|||||||
Reference in New Issue
Block a user