Files
urldb/monitor/metrics.go

458 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package monitor
import (
"fmt"
"net/http"
"runtime"
"sync"
"time"
"github.com/ctwj/urldb/utils"
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Metrics 监控指标
type Metrics struct {
// HTTP请求指标
RequestsTotal *prometheus.CounterVec
RequestDuration *prometheus.HistogramVec
RequestSize *prometheus.SummaryVec
ResponseSize *prometheus.SummaryVec
// 数据库指标
DatabaseQueries *prometheus.CounterVec
DatabaseErrors *prometheus.CounterVec
DatabaseDuration *prometheus.HistogramVec
// 系统指标
MemoryUsage prometheus.Gauge
Goroutines prometheus.Gauge
GCStats *prometheus.CounterVec
// 业务指标
ResourcesCreated *prometheus.CounterVec
ResourcesViewed *prometheus.CounterVec
Searches *prometheus.CounterVec
Transfers *prometheus.CounterVec
// 错误指标
ErrorsTotal *prometheus.CounterVec
// 自定义指标
CustomCounters map[string]prometheus.Counter
CustomGauges map[string]prometheus.Gauge
mu sync.RWMutex
}
// MetricsConfig 监控配置
type MetricsConfig struct {
Enabled bool
ListenAddress string
MetricsPath string
Namespace string
Subsystem string
}
// DefaultMetricsConfig 默认监控配置
func DefaultMetricsConfig() *MetricsConfig {
return &MetricsConfig{
Enabled: true,
ListenAddress: ":9090",
MetricsPath: "/metrics",
Namespace: "urldb",
Subsystem: "api",
}
}
// GlobalMetrics 全局监控实例
var (
globalMetrics *Metrics
once sync.Once
)
// NewMetrics 创建新的监控指标
func NewMetrics(config *MetricsConfig) *Metrics {
if config == nil {
config = DefaultMetricsConfig()
}
namespace := config.Namespace
subsystem := config.Subsystem
m := &Metrics{
// HTTP请求指标
RequestsTotal: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "http_requests_total",
Help: "Total number of HTTP requests",
},
[]string{"method", "endpoint", "status"},
),
RequestDuration: promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "http_request_duration_seconds",
Help: "HTTP request duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"method", "endpoint", "status"},
),
RequestSize: promauto.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "http_request_size_bytes",
Help: "HTTP request size in bytes",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"method", "endpoint"},
),
ResponseSize: promauto.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "http_response_size_bytes",
Help: "HTTP response size in bytes",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"method", "endpoint"},
),
// 数据库指标
DatabaseQueries: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "database",
Name: "queries_total",
Help: "Total number of database queries",
},
[]string{"table", "operation"},
),
DatabaseErrors: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "database",
Name: "errors_total",
Help: "Total number of database errors",
},
[]string{"table", "operation", "error"},
),
DatabaseDuration: promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: "database",
Name: "query_duration_seconds",
Help: "Database query duration in seconds",
Buckets: prometheus.DefBuckets,
},
[]string{"table", "operation"},
),
// 系统指标
MemoryUsage: promauto.NewGauge(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: "system",
Name: "memory_usage_bytes",
Help: "Current memory usage in bytes",
},
),
Goroutines: promauto.NewGauge(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: "system",
Name: "goroutines",
Help: "Number of goroutines",
},
),
GCStats: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "system",
Name: "gc_stats_total",
Help: "Garbage collection statistics",
},
[]string{"type"},
),
// 业务指标
ResourcesCreated: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "business",
Name: "resources_created_total",
Help: "Total number of resources created",
},
[]string{"category", "platform"},
),
ResourcesViewed: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "business",
Name: "resources_viewed_total",
Help: "Total number of resources viewed",
},
[]string{"category"},
),
Searches: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "business",
Name: "searches_total",
Help: "Total number of searches",
},
[]string{"platform"},
),
Transfers: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "business",
Name: "transfers_total",
Help: "Total number of transfers",
},
[]string{"platform", "status"},
),
// 错误指标
ErrorsTotal: promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: "errors",
Name: "total",
Help: "Total number of errors",
},
[]string{"type", "endpoint"},
),
// 自定义指标
CustomCounters: make(map[string]prometheus.Counter),
CustomGauges: make(map[string]prometheus.Gauge),
}
// 启动系统指标收集
go m.collectSystemMetrics()
return m
}
// GetGlobalMetrics 获取全局监控实例
func GetGlobalMetrics() *Metrics {
once.Do(func() {
globalMetrics = NewMetrics(DefaultMetricsConfig())
})
return globalMetrics
}
// SetGlobalMetrics 设置全局监控实例
func SetGlobalMetrics(metrics *Metrics) {
globalMetrics = metrics
}
// collectSystemMetrics 收集系统指标
func (m *Metrics) collectSystemMetrics() {
ticker := time.NewTicker(10 * time.Second)
defer ticker.Stop()
for range ticker.C {
// 收集内存使用情况
var ms runtime.MemStats
runtime.ReadMemStats(&ms)
m.MemoryUsage.Set(float64(ms.Alloc))
// 收集goroutine数量
m.Goroutines.Set(float64(runtime.NumGoroutine()))
// 收集GC统计
m.GCStats.WithLabelValues("alloc").Add(float64(ms.TotalAlloc))
m.GCStats.WithLabelValues("sys").Add(float64(ms.Sys))
m.GCStats.WithLabelValues("lookups").Add(float64(ms.Lookups))
m.GCStats.WithLabelValues("mallocs").Add(float64(ms.Mallocs))
m.GCStats.WithLabelValues("frees").Add(float64(ms.Frees))
}
}
// MetricsMiddleware 监控中间件
func (m *Metrics) MetricsMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
start := time.Now()
path := c.FullPath()
// 如果没有匹配的路由,使用请求路径
if path == "" {
path = c.Request.URL.Path
}
// 记录请求大小
requestSize := float64(c.Request.ContentLength)
m.RequestSize.WithLabelValues(c.Request.Method, path).Observe(requestSize)
c.Next()
// 记录响应信息
status := c.Writer.Status()
latency := time.Since(start).Seconds()
responseSize := float64(c.Writer.Size())
// 更新指标
m.RequestsTotal.WithLabelValues(c.Request.Method, path, fmt.Sprintf("%d", status)).Inc()
m.RequestDuration.WithLabelValues(c.Request.Method, path, fmt.Sprintf("%d", status)).Observe(latency)
m.ResponseSize.WithLabelValues(c.Request.Method, path).Observe(responseSize)
// 如果是错误状态码,记录错误
if status >= 400 {
m.ErrorsTotal.WithLabelValues("http", path).Inc()
}
}
}
// StartMetricsServer 启动监控服务器
func (m *Metrics) StartMetricsServer(config *MetricsConfig) {
if config == nil {
config = DefaultMetricsConfig()
}
if !config.Enabled {
utils.Info("监控服务器未启用")
return
}
// 创建新的Gin路由器
router := gin.New()
router.Use(gin.Recovery())
// 注册Prometheus指标端点
router.GET(config.MetricsPath, gin.WrapH(promhttp.Handler()))
// 启动HTTP服务器
go func() {
utils.Info("监控服务器启动在 %s", config.ListenAddress)
if err := router.Run(config.ListenAddress); err != nil {
utils.Error("监控服务器启动失败: %v", err)
}
}()
utils.Info("监控服务器已启动,指标路径: %s%s", config.ListenAddress, config.MetricsPath)
}
// IncrementDatabaseQuery 增加数据库查询计数
func (m *Metrics) IncrementDatabaseQuery(table, operation string) {
m.DatabaseQueries.WithLabelValues(table, operation).Inc()
}
// IncrementDatabaseError 增加数据库错误计数
func (m *Metrics) IncrementDatabaseError(table, operation, error string) {
m.DatabaseErrors.WithLabelValues(table, operation, error).Inc()
}
// ObserveDatabaseDuration 记录数据库查询耗时
func (m *Metrics) ObserveDatabaseDuration(table, operation string, duration float64) {
m.DatabaseDuration.WithLabelValues(table, operation).Observe(duration)
}
// IncrementResourceCreated 增加资源创建计数
func (m *Metrics) IncrementResourceCreated(category, platform string) {
m.ResourcesCreated.WithLabelValues(category, platform).Inc()
}
// IncrementResourceViewed 增加资源查看计数
func (m *Metrics) IncrementResourceViewed(category string) {
m.ResourcesViewed.WithLabelValues(category).Inc()
}
// IncrementSearch 增加搜索计数
func (m *Metrics) IncrementSearch(platform string) {
m.Searches.WithLabelValues(platform).Inc()
}
// IncrementTransfer 增加转存计数
func (m *Metrics) IncrementTransfer(platform, status string) {
m.Transfers.WithLabelValues(platform, status).Inc()
}
// IncrementError 增加错误计数
func (m *Metrics) IncrementError(errorType, endpoint string) {
m.ErrorsTotal.WithLabelValues(errorType, endpoint).Inc()
}
// AddCustomCounter 添加自定义计数器
func (m *Metrics) AddCustomCounter(name, help string, labels []string) prometheus.Counter {
m.mu.Lock()
defer m.mu.Unlock()
key := fmt.Sprintf("%s_%v", name, labels)
if counter, exists := m.CustomCounters[key]; exists {
return counter
}
counter := promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: "urldb",
Name: name,
Help: help,
},
labels,
).WithLabelValues() // 如果没有标签,返回默认实例
m.CustomCounters[key] = counter
return counter
}
// AddCustomGauge 添加自定义仪表盘
func (m *Metrics) AddCustomGauge(name, help string, labels []string) prometheus.Gauge {
m.mu.Lock()
defer m.mu.Unlock()
key := fmt.Sprintf("%s_%v", name, labels)
if gauge, exists := m.CustomGauges[key]; exists {
return gauge
}
gauge := promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "urldb",
Name: name,
Help: help,
},
labels,
).WithLabelValues() // 如果没有标签,返回默认实例
m.CustomGauges[key] = gauge
return gauge
}
// GetMetricsSummary 获取指标摘要
func (m *Metrics) GetMetricsSummary() map[string]interface{} {
// 这里可以实现获取当前指标摘要的逻辑
// 由于Prometheus指标不能直接读取我们只能返回一些基本的统计信息
return map[string]interface{}{
"timestamp": time.Now(),
"status": "running",
"info": "使用 /metrics 端点获取详细指标",
}
}
// HealthCheck 健康检查
func (m *Metrics) HealthCheck(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{
"status": "healthy",
"timestamp": time.Now().Unix(),
"version": "1.0.0",
})
}
// SetupHealthCheck 设置健康检查端点
func (m *Metrics) SetupHealthCheck(router *gin.Engine) {
router.GET("/health", m.HealthCheck)
router.GET("/healthz", m.HealthCheck)
}
// MetricsHandler 指标处理器
func (m *Metrics) MetricsHandler() gin.HandlerFunc {
return gin.WrapH(promhttp.Handler())
}