// ai/metrics/prometheus.go
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// Name components shared by every metric declared in this file.
const (
	metricsNamespace = "divinesense"
	metricsSubsystem = "ai"
)

// Metric collectors for the AI module. Each one is created through promauto,
// so it is registered at package initialisation time.
var (
	// Request latency distribution.

	// ChatLatency is a histogram of AI chat request latency in seconds,
	// partitioned by agent_type, intent and status.
	ChatLatency = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "chat_latency_seconds",
			Help:      "AI chat request latency in seconds",
			Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
		},
		[]string{
			"agent_type",
			"intent",
			"status",
		},
	)

	// Request totals.

	// ChatTotal counts AI chat requests, partitioned by agent_type, intent
	// and status (the same label set as ChatLatency).
	ChatTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "chat_total",
			Help:      "Total number of AI chat requests",
		},
		[]string{
			"agent_type",
			"intent",
			"status",
		},
	)

	// Tool calls.

	// ToolCallTotal counts tool calls, partitioned by tool_name and success.
	ToolCallTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "tool_call_total",
			Help:      "Total number of tool calls",
		},
		[]string{
			"tool_name",
			"success",
		},
	)

	// ToolCallDuration is a histogram of tool call duration in seconds,
	// partitioned by tool_name.
	ToolCallDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "tool_call_duration_seconds",
			Help:      "Tool call duration in seconds",
			Buckets:   []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1},
		},
		[]string{
			"tool_name",
		},
	)

	// Cache metrics.

	// CacheHits counts cache hits, partitioned by cache_layer
	// ("l1_exact", "l2_semantic", "l3_tool").
	CacheHits = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "cache_hits_total",
			Help:      "Total number of cache hits",
		},
		[]string{
			"cache_layer",
		},
	)

	// CacheMisses counts cache misses, partitioned by cache_layer.
	CacheMisses = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "cache_misses_total",
			Help:      "Total number of cache misses",
		},
		[]string{
			"cache_layer",
		},
	)

	// LLM calls.

	// LLMCallTotal counts LLM calls, partitioned by purpose, model and
	// success.
	LLMCallTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "llm_call_total",
			Help:      "Total number of LLM calls",
		},
		[]string{
			"purpose",
			"model",
			"success",
		},
	)

	// LLMTokensUsed counts LLM tokens, partitioned by purpose, model and
	// type (input/output).
	LLMTokensUsed = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "llm_tokens_total",
			Help:      "Total LLM tokens used",
		},
		[]string{
			"purpose",
			"model",
			"type",
		},
	)

	// Routing metrics.

	// RoutingDecision counts routing decisions, partitioned by method
	// (cache/rule/history/ml/llm) and agent_type.
	RoutingDecision = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsSubsystem,
			Name:      "routing_decision_total",
			Help:      "Routing decisions by method",
		},
		[]string{
			"method",
			"agent_type",
		},
	)
)
// ExecuteWithCallback is an instrumentation sketch showing where the AI
// metrics are recorded while a chat request is handled. The parameter list
// and the tail of the body are elided in this excerpt.
//
// Recorded metrics, in order:
//   - ChatLatency / ChatTotal: once per call, from a deferred function, so
//     they are emitted on every return path (including the cache-hit return).
//   - CacheHits / CacheMisses: for the "l1_exact" cache layer.
//   - ToolCallDuration / ToolCallTotal: once per executed tool.
func (p *AmazingParrot) ExecuteWithCallback(...) error {
	startTime := time.Now()
	// NOTE(review): nothing in the visible code reassigns status, so as shown
	// every request is recorded with status "success", even when a tool
	// fails. Confirm that the elided part updates it on failure.
	status := "success"
	defer func() {
		duration := time.Since(startTime).Seconds()
		metrics.ChatLatency.WithLabelValues(
			p.Name(), plan.intent.String(), status,
		).Observe(duration)
		metrics.ChatTotal.WithLabelValues(
			p.Name(), plan.intent.String(), status,
		).Inc()
	}()

	// Cache check. The cached value is not used on this path, so it is
	// discarded with the blank identifier; an unused named variable is a
	// compile error in Go.
	if _, found := p.cache.Get(key); found {
		metrics.CacheHits.WithLabelValues("l1_exact").Inc()
		return nil
	}
	metrics.CacheMisses.WithLabelValues("l1_exact").Inc()

	// Tool calls.
	for _, tool := range tools {
		toolStart := time.Now()
		// Only the error is inspected here; the tool result is discarded.
		_, err := tool.Execute(ctx, input)
		// The duration is observed for failed calls as well as successful ones.
		metrics.ToolCallDuration.WithLabelValues(
			tool.Name(),
		).Observe(time.Since(toolStart).Seconds())

		success := "true"
		if err != nil {
			success = "false"
		}
		metrics.ToolCallTotal.WithLabelValues(tool.Name(), success).Inc()
	}
	// ...
}
问题描述
当前缺少 AI 模块的业务指标监控:
建议方案
核心指标定义
埋点示例
Grafana Dashboard
提供预配置的 Grafana 仪表板:
文件变更
- ai/metrics/prometheus.go - 指标定义
- ai/agent/amazing_parrot.go - 埋点
- ai/agent/tools/*.go - 埋点
- ai/router/service.go - 路由指标
- deploy/grafana/ai-dashboard.json - Grafana 仪表板
验收标准
预估工时
1-2 天
相关 Issue