|
13 | 13 | from dataclasses import asdict, dataclass |
14 | 14 | from typing import Any, Protocol |
15 | 15 |
|
| 16 | +import prometheus_client |
16 | 17 | import torch |
17 | 18 | from pydantic import BaseModel, Field, ValidationError, model_validator |
18 | 19 | from typing_extensions import Self |
@@ -1233,6 +1234,87 @@ def log(self, log_fn=logger.info, log_prefix: str = "") -> None: |
1233 | 1234 | self.reset() |
1234 | 1235 |
|
1235 | 1236 |
|
#### Prometheus Integration ####


class PerfMetricsProm:
    """Record performance metrics in Prometheus.

    Average TFLOPS (tera floating-point operations per second) can be
    calculated using a PromQL query:

    rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12

    Average memory bandwidth in GB/s can be calculated using:

    (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
    rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
    """

    # Class attribute so tests (or subclasses) can substitute a stub Counter.
    _counter_cls = prometheus_client.Counter

    def __init__(
        self,
        vllm_config: VllmConfig,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ):
        """Create the per-GPU FLOP and memory-traffic counters.

        Args:
            vllm_config: Engine configuration. Not read here; presumably kept
                for signature parity with the other metric recorders — TODO
                confirm before removing.
            labelnames: Prometheus label names shared by all three counters.
            per_engine_labelvalues: Mapping of engine index to the label
                values identifying that engine's child counter.
        """
        # The three counters differ only in name and description, so build
        # them through one helper instead of three copy-pasted stanzas.
        self.counter_flops = self._make_per_engine_counter(
            name="vllm:estimated_flops_per_gpu_total",
            documentation=(
                "Estimated number of floating point operations per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
            per_engine_labelvalues=per_engine_labelvalues,
        )
        self.counter_read_bytes = self._make_per_engine_counter(
            name="vllm:estimated_read_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes read from memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
            per_engine_labelvalues=per_engine_labelvalues,
        )
        self.counter_write_bytes = self._make_per_engine_counter(
            name="vllm:estimated_write_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes written to memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
            per_engine_labelvalues=per_engine_labelvalues,
        )

    def _make_per_engine_counter(
        self,
        name: str,
        documentation: str,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ) -> dict[int, "prometheus_client.Counter"]:
        """Create one counter and fan it out into per-engine children."""
        counter = self._counter_cls(
            name=name,
            documentation=documentation,
            labelnames=labelnames,
        )
        return make_per_engine(counter, per_engine_labelvalues)

    def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
        """Accumulate one step's estimated FLOPs and memory traffic.

        No-op when all three estimates are falsy, so engines that do not
        produce perf stats emit no Prometheus samples.
        """
        if not (
            perf_stats.num_flops_per_gpu
            or perf_stats.num_read_bytes_per_gpu
            or perf_stats.num_write_bytes_per_gpu
        ):
            return
        # Defensive `or 0`: the guard above passes if ANY field is truthy,
        # but Counter.inc(None) raises — a field left unset (None) while a
        # sibling is populated must count as zero. (Assumes PerfStats fields
        # are numeric-or-None — TODO confirm against PerfStats definition.)
        self.counter_flops[engine_idx].inc(perf_stats.num_flops_per_gpu or 0)
        self.counter_read_bytes[engine_idx].inc(
            perf_stats.num_read_bytes_per_gpu or 0
        )
        self.counter_write_bytes[engine_idx].inc(
            perf_stats.num_write_bytes_per_gpu or 0
        )
| 1307 | + |
def make_per_engine(
    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
):
    """Fan *counter* out into one labelled child counter per engine index."""
    per_engine = {}
    for engine_idx, labelvalues in per_engine_labelvalues.items():
        per_engine[engine_idx] = counter.labels(*labelvalues)
    return per_engine
| 1316 | + |
| 1317 | + |
1236 | 1318 | ## util functions |
1237 | 1319 |
|
1238 | 1320 |
|
|
0 commit comments