Skip to content

[AI优化] Prometheus 指标增强 - AI 模块业务监控 #100

@hrygo

Description

@hrygo

问题描述

当前缺少 AI 模块的业务指标监控:

  • 无法实时了解 AI 功能的使用情况
  • 性能问题难以发现
  • 缓存效果无法量化
  • 工具调用成功率未跟踪

建议方案

核心指标定义

// ai/metrics/prometheus.go
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// Business metrics for the AI module. Every collector below is created through
// promauto and shares the "divinesense" namespace and "ai" subsystem, so the
// exposed metric names all start with "divinesense_ai_".
var (
    // ChatLatency is a histogram of AI chat request latency in seconds,
    // partitioned by agent type, intent and request status.
    // Buckets span 0.1s to 10s.
    ChatLatency = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "chat_latency_seconds",
            Help:      "AI chat request latency in seconds",
            Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
        },
        []string{"agent_type", "intent", "status"}, // label names
    )
    
    // ChatTotal counts AI chat requests, using the same label set as
    // ChatLatency (agent type, intent, status).
    ChatTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "chat_total",
            Help:      "Total number of AI chat requests",
        },
        []string{"agent_type", "intent", "status"},
    )
    
    // ToolCallTotal counts tool calls, partitioned by tool name and by
    // whether the call succeeded.
    ToolCallTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "tool_call_total",
            Help:      "Total number of tool calls",
        },
        []string{"tool_name", "success"},
    )
    
    // ToolCallDuration is a histogram of tool call duration in seconds,
    // partitioned by tool name only (there is no success label here, unlike
    // ToolCallTotal). Buckets span 10ms to 1s.
    ToolCallDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "tool_call_duration_seconds",
            Help:      "Tool call duration in seconds",
            Buckets:   []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1},
        },
        []string{"tool_name"},
    )
    
    // CacheHits counts cache hits per cache layer.
    CacheHits = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "cache_hits_total",
            Help:      "Total number of cache hits",
        },
        []string{"cache_layer"}, // values: "l1_exact", "l2_semantic", "l3_tool"
    )
    
    // CacheMisses counts cache misses per cache layer; it uses the same
    // cache_layer label as CacheHits.
    CacheMisses = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "cache_misses_total",
            Help:      "Total number of cache misses",
        },
        []string{"cache_layer"},
    )
    
    // LLMCallTotal counts LLM calls, partitioned by purpose, model and
    // whether the call succeeded.
    LLMCallTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "llm_call_total",
            Help:      "Total number of LLM calls",
        },
        []string{"purpose", "model", "success"},
    )
    
    // LLMTokensUsed counts LLM tokens consumed, partitioned by purpose,
    // model and token type.
    LLMTokensUsed = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "llm_tokens_total",
            Help:      "Total LLM tokens used",
        },
        []string{"purpose", "model", "type"}, // type values: input/output
    )
    
    // RoutingDecision counts routing decisions, partitioned by the routing
    // method that produced the decision and by agent type.
    RoutingDecision = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Namespace: "divinesense",
            Subsystem: "ai",
            Name:      "routing_decision_total",
            Help:      "Routing decisions by method",
        },
        []string{"method", "agent_type"}, // method values: cache/rule/history/ml/llm
    )
)

埋点示例

// ExecuteWithCallback is an instrumentation example showing where the AI
// metrics are recorded during a chat request: overall latency and request
// count, L1 cache hit/miss, and per-tool duration and outcome.
//
// The result is named so that the deferred recorder can derive the "status"
// label from the error actually returned; with a status variable fixed to
// "success" the failure series would never be populated.
func (p *AmazingParrot) ExecuteWithCallback(...) (err error) {
    startTime := time.Now()

    // Record latency and request count exactly once, on every return path.
    defer func() {
        status := "success"
        if err != nil {
            // NOTE(review): "error" is the failure label value chosen here;
            // align it with whatever the dashboards filter on.
            status = "error"
        }
        intent := plan.intent.String()
        elapsed := time.Since(startTime).Seconds()
        metrics.ChatLatency.WithLabelValues(p.Name(), intent, status).Observe(elapsed)
        metrics.ChatTotal.WithLabelValues(p.Name(), intent, status).Inc()
    }()

    // Cache check: a hit short-circuits the request and is reported as a
    // success. The cached value itself is not consumed in this excerpt.
    if _, found := p.cache.Get(key); found {
        metrics.CacheHits.WithLabelValues("l1_exact").Inc()
        return nil
    }
    metrics.CacheMisses.WithLabelValues("l1_exact").Inc()

    // Tool calls: duration is observed for every call, successful or not.
    for _, tool := range tools {
        toolStart := time.Now()
        // toolErr is kept separate from the named result so that a failed
        // tool only affects the tool metrics, as in the original example.
        // The tool's result is discarded in this excerpt.
        _, toolErr := tool.Execute(ctx, input)

        metrics.ToolCallDuration.WithLabelValues(
            tool.Name(),
        ).Observe(time.Since(toolStart).Seconds())

        success := "true"
        if toolErr != nil {
            success = "false"
        }
        metrics.ToolCallTotal.WithLabelValues(tool.Name(), success).Inc()
    }

    // ...
}

Grafana Dashboard

提供预配置的 Grafana 仪表板:

  • 概览面板: 请求量、延迟 P50/P95/P99、错误率
  • 工具面板: 各工具调用量、成功率、延迟
  • 缓存面板: 各层缓存命中率趋势
  • LLM 面板: Token 使用量、成本估算

文件变更

  • ai/metrics/prometheus.go - 指标定义
  • ai/agent/amazing_parrot.go - 埋点
  • ai/agent/tools/*.go - 埋点
  • ai/router/service.go - 路由指标
  • deploy/grafana/ai-dashboard.json - Grafana 仪表板

验收标准

  • 覆盖核心业务指标:对话请求量与延迟、工具调用、缓存命中/未命中、LLM 调用与 Token 用量、路由决策
  • /metrics 端点正确暴露
  • Grafana 仪表板可导入使用
  • 指标命名符合 Prometheus 规范(计数器以 _total 结尾,时间类指标使用基本单位并以 _seconds 结尾)
  • 文档说明各指标含义

预估工时

1-2 天

相关 Issue

Metadata

Metadata

Assignees

No one assigned

    Labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions