2025-11-06 10:40:30 +08:00
|
|
|
|
# -*- coding: utf-8 -*-
|
2025-11-18 12:24:02 +08:00
|
|
|
|
# Copyright (c) 2025 relakkes@gmail.com
|
|
|
|
|
|
#
|
|
|
|
|
|
# This file is part of MediaCrawler project.
|
|
|
|
|
|
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/database/mongodb_store_base.py
|
|
|
|
|
|
# GitHub: https://github.com/NanmiCoder
|
|
|
|
|
|
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
|
|
|
|
|
|
#
|
|
|
|
|
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
|
|
|
|
|
# 1. 不得用于任何商业用途。
|
|
|
|
|
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
|
|
|
|
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
|
|
|
|
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
|
|
|
|
|
# 5. 不得用于任何非法或不当的用途。
|
|
|
|
|
|
#
|
|
|
|
|
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
|
|
|
|
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
|
|
|
|
|
|
2025-11-07 17:42:28 +08:00
|
|
|
|
"""MongoDB存储基类:提供连接管理和通用存储方法"""
|
2025-11-06 10:40:30 +08:00
|
|
|
|
import asyncio
|
|
|
|
|
|
from typing import Dict, List, Optional
|
|
|
|
|
|
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection
|
|
|
|
|
|
from config import db_config
|
|
|
|
|
|
from tools import utils
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MongoDBConnection:
    """MongoDB connection manager (singleton).

    ``__new__`` always hands back the same instance, so every caller shares
    one Motor client and one database handle. The connection is opened lazily
    by the first ``get_client()`` / ``get_db()`` call.
    """

    _instance = None
    # Annotations are quoted so they are not evaluated when the class body runs.
    _client: "Optional[AsyncIOMotorClient]" = None
    _db: "Optional[AsyncIOMotorDatabase]" = None
    # Serializes the first connection attempt between concurrent coroutines.
    _lock = asyncio.Lock()

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(MongoDBConnection, cls).__new__(cls)
        return cls._instance

    @staticmethod
    def _build_connection_url(host, port, user, password) -> str:
        """Build the ``mongodb://`` URL, with or without credentials.

        Args:
            host: Server host name or address.
            port: Server port.
            user: User name; a falsy value selects the unauthenticated URL.
            password: Password; a falsy value selects the unauthenticated URL.

        Returns:
            The connection URL. Credentials are percent-escaped so reserved
            characters such as ``@``, ``:`` or ``/`` cannot corrupt the URL.
        """
        # Local import: keeps this block independent of the file's import list.
        from urllib.parse import quote_plus

        if user and password:
            safe_user = quote_plus(str(user))
            safe_password = quote_plus(str(password))
            return f"mongodb://{safe_user}:{safe_password}@{host}:{port}/"
        return f"mongodb://{host}:{port}/"

    async def get_client(self) -> "AsyncIOMotorClient":
        """Return the shared client, connecting first if necessary.

        Raises:
            Exception: Whatever ``_connect`` raised when the connection failed.
        """
        if self._client is None:
            async with self._lock:
                # Re-check: another coroutine may have connected while we waited.
                if self._client is None:
                    await self._connect()
        return self._client

    async def get_db(self) -> "AsyncIOMotorDatabase":
        """Return the shared database handle, connecting first if necessary.

        Raises:
            Exception: Whatever ``_connect`` raised when the connection failed.
        """
        if self._db is None:
            async with self._lock:
                # Re-check: another coroutine may have connected while we waited.
                if self._db is None:
                    await self._connect()
        return self._db

    async def _connect(self):
        """Open the connection described by ``db_config.mongodb_config``.

        The client and database are stored on the instance only after the
        server has answered, so a failed attempt leaves no half-initialised
        state behind and the next call retries from scratch.

        Raises:
            Exception: Any error raised while reading the configuration or
                contacting the server; it is logged and re-raised.
        """
        client = None
        try:
            mongo_config = db_config.mongodb_config
            host = mongo_config["host"]
            port = mongo_config["port"]
            user = mongo_config["user"]
            password = mongo_config["password"]
            db_name = mongo_config["db_name"]

            connection_url = self._build_connection_url(host, port, user, password)
            client = AsyncIOMotorClient(connection_url, serverSelectionTimeoutMS=5000)
            await client.server_info()  # Round-trip to verify the connection.
            database = client[db_name]
        except Exception as e:
            if client is not None:
                # Do not leak the client created by the failed attempt.
                client.close()
            utils.logger.error(f"[MongoDBConnection] Connection failed: {e}")
            raise

        self._client = client
        self._db = database
        utils.logger.info(f"[MongoDBConnection] Connected to {host}:{port}/{db_name}")

    async def close(self):
        """Close the client, if one is open, and forget both cached handles."""
        if self._client is not None:
            self._client.close()
            self._client = None
            self._db = None
            utils.logger.info("[MongoDBConnection] Connection closed")
|
2025-11-06 10:40:30 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MongoDBStoreBase:
    """MongoDB storage base class providing generic read/write helpers.

    Collections are addressed as ``{collection_prefix}_{collection_suffix}``.
    Every helper except ``get_collection`` logs and swallows errors, reporting
    failure through its return value instead.
    """

    def __init__(self, collection_prefix: str):
        """Initialise the store.

        Args:
            collection_prefix: Platform prefix (xhs/douyin/bilibili, etc.).
        """
        self._connection = MongoDBConnection()
        self.collection_prefix = collection_prefix

    async def get_collection(self, collection_suffix: str) -> AsyncIOMotorCollection:
        """Return the collection named ``{prefix}_{suffix}``."""
        database = await self._connection.get_db()
        return database[f"{self.collection_prefix}_{collection_suffix}"]

    async def save_or_update(self, collection_suffix: str, query: Dict, data: Dict) -> bool:
        """Upsert ``data`` into the first document matching ``query``.

        Returns:
            True when the update call completed, False when it raised.
        """
        try:
            target = await self.get_collection(collection_suffix)
            await target.update_one(query, {"$set": data}, upsert=True)
        except Exception as e:
            utils.logger.error(f"[MongoDBStoreBase] Save failed ({self.collection_prefix}_{collection_suffix}): {e}")
            return False
        else:
            return True

    async def find_one(self, collection_suffix: str, query: Dict) -> Optional[Dict]:
        """Fetch a single document matching ``query``.

        Returns:
            Whatever the collection's ``find_one`` returns, or None when the
            lookup raised.
        """
        try:
            target = await self.get_collection(collection_suffix)
            document = await target.find_one(query)
        except Exception as e:
            utils.logger.error(f"[MongoDBStoreBase] Find one failed ({self.collection_prefix}_{collection_suffix}): {e}")
            return None
        return document

    async def find_many(self, collection_suffix: str, query: Dict, limit: int = 0) -> List[Dict]:
        """Fetch all documents matching ``query``.

        Args:
            collection_suffix: Suffix of the collection to search.
            query: Filter passed to the collection's ``find``.
            limit: Maximum number of documents; 0 (the default) means no limit.

        Returns:
            The matching documents, or an empty list when the query raised.
        """
        try:
            target = await self.get_collection(collection_suffix)
            cursor = target.find(query)
            # Only a positive limit is forwarded to the cursor.
            cursor = cursor.limit(limit) if limit > 0 else cursor
            documents = await cursor.to_list(length=None)
        except Exception as e:
            utils.logger.error(f"[MongoDBStoreBase] Find many failed ({self.collection_prefix}_{collection_suffix}): {e}")
            return []
        return documents

    async def create_index(self, collection_suffix: str, keys: List[tuple], unique: bool = False):
        """Create an index on the collection, e.g. ``keys=[("field", 1)]``.

        Failures are logged and swallowed; nothing is returned either way.
        """
        try:
            target = await self.get_collection(collection_suffix)
            await target.create_index(keys, unique=unique)
            utils.logger.info(f"[MongoDBStoreBase] Index created on {self.collection_prefix}_{collection_suffix}")
        except Exception as e:
            utils.logger.error(f"[MongoDBStoreBase] Create index failed: {e}")
|