|
13 | 13 | from dataclasses import asdict, dataclass |
14 | 14 | from typing import Any, Protocol |
15 | 15 |
|
| 16 | +import prometheus_client |
16 | 17 | import torch |
17 | 18 | from pydantic import BaseModel, Field, ValidationError, model_validator |
18 | 19 | from typing_extensions import Self |
@@ -1233,6 +1234,87 @@ def log(self, log_fn=logger.info, log_prefix: str = "") -> None: |
1233 | 1234 | self.reset() |
1234 | 1235 |
|
1235 | 1236 |
|
#### Prometheus Integration ####


class PerfMetricsProm:
    """Record performance metrics in Prometheus.

    Average TFLOPS (tera floating-point operations per second) can be
    calculated using a PromQL query:

    rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12

    Average memory bandwidth in GB/s can be calculated using:

    (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
    rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
    """

    # Class attribute so tests (or subclasses) can substitute a stub Counter.
    _counter_cls = prometheus_client.Counter

    def __init__(
        self,
        vllm_config: VllmConfig,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ):
        """Create the per-GPU FLOP and memory-traffic counters.

        Args:
            vllm_config: Engine configuration. Not read here; presumably kept
                for signature parity with the other metric recorders — TODO
                confirm before removing.
            labelnames: Prometheus label names shared by all three counters.
            per_engine_labelvalues: Mapping of engine index to the label
                values identifying that engine's child counter.
        """
        # The three counters differ only in name and description, so build
        # them through one helper instead of three copy-pasted stanzas.
        self.counter_flops = self._make_per_engine_counter(
            name="vllm:estimated_flops_per_gpu_total",
            documentation=(
                "Estimated number of floating point operations per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
            per_engine_labelvalues=per_engine_labelvalues,
        )
        self.counter_read_bytes = self._make_per_engine_counter(
            name="vllm:estimated_read_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes read from memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
            per_engine_labelvalues=per_engine_labelvalues,
        )
        self.counter_write_bytes = self._make_per_engine_counter(
            name="vllm:estimated_write_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes written to memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
            per_engine_labelvalues=per_engine_labelvalues,
        )

    def _make_per_engine_counter(
        self,
        name: str,
        documentation: str,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ) -> dict[int, "prometheus_client.Counter"]:
        """Create one counter and fan it out into per-engine children."""
        counter = self._counter_cls(
            name=name,
            documentation=documentation,
            labelnames=labelnames,
        )
        return make_per_engine(counter, per_engine_labelvalues)

    def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
        """Accumulate one step's estimated FLOPs and memory traffic.

        No-op when all three estimates are falsy, so engines that do not
        produce perf stats emit no Prometheus samples.
        """
        if not (
            perf_stats.num_flops_per_gpu
            or perf_stats.num_read_bytes_per_gpu
            or perf_stats.num_write_bytes_per_gpu
        ):
            return
        # Defensive `or 0`: the guard above passes if ANY field is truthy,
        # but Counter.inc(None) raises — a field left unset (None) while a
        # sibling is populated must count as zero. (Assumes PerfStats fields
        # are numeric-or-None — TODO confirm against PerfStats definition.)
        self.counter_flops[engine_idx].inc(perf_stats.num_flops_per_gpu or 0)
        self.counter_read_bytes[engine_idx].inc(
            perf_stats.num_read_bytes_per_gpu or 0
        )
        self.counter_write_bytes[engine_idx].inc(
            perf_stats.num_write_bytes_per_gpu or 0
        )
| 1307 | + |
def make_per_engine(
    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
):
    """Fan *counter* out into one labelled child counter per engine index."""
    per_engine = {}
    for engine_idx, labelvalues in per_engine_labelvalues.items():
        per_engine[engine_idx] = counter.labels(*labelvalues)
    return per_engine
| 1316 | + |
| 1317 | + |
1236 | 1318 | ## util functions |
1237 | 1319 |
|
1238 | 1320 |
|
|
0 commit comments