Support EPLB balancedness Prometheus metric without GPU->CPU synchronization (sgl-project#15401)

fzyzcjy · web-flow · commit 88a405cc10c5 · 2025-12-18T22:24:23.000+08:00
diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
@@ -268,6 +268,7 @@ class Envs:
     SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)
     SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR = EnvStr("/tmp")
     SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL = EnvInt(0)
+    SGLANG_ENABLE_EPLB_BALANCEDNESS_METRIC = EnvBool(False)
 
     # TBO
     SGLANG_TBO_DEBUG = EnvBool(False)
diff --git a/python/sglang/srt/eplb/expert_distribution.py b/python/sglang/srt/eplb/expert_distribution.py
@@ -20,6 +20,7 @@
 from abc import ABC
 from collections import deque
 from contextlib import contextmanager
+from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
 
@@ -43,6 +44,14 @@
 _OutputMode = Literal["file", "object"]
 
 
+@dataclass
+class ExpertDistributionMetrics:
+    eplb_balancedness: torch.Tensor
+
+    def copy_to_cpu(self):
+        self.eplb_balancedness = self.eplb_balancedness.to("cpu", non_blocking=True)
+
+
 class ExpertDistributionRecorder(ABC):
     """Global expert distribution recording"""
 
@@ -78,7 +87,7 @@ def disable_this_region(self):
 
     @contextmanager
     def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
-        yield
+        yield {}
 
     def on_select_experts(self, topk_ids: torch.Tensor):
         pass
@@ -157,12 +166,13 @@ def with_debug_name(self, debug_name):
 
     @contextmanager
     def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
+        outputs = {}
         with self._current_forward_pass_id.with_value(forward_pass_id):
             self._on_forward_pass_start(forward_batch)
             try:
-                yield
+                yield outputs
             finally:
-                self._on_forward_pass_end(forward_pass_id)
+                self._on_forward_pass_end(forward_pass_id, outputs)
 
     @contextmanager
     def disable_this_region(self):
@@ -181,12 +191,14 @@ def _on_forward_pass_start(self, forward_batch: ForwardBatch):
             gatherer.reset()
             gatherer.on_forward_pass_start(forward_batch)
 
-    def _on_forward_pass_end(self, forward_pass_id: int):
+    def _on_forward_pass_end(self, forward_pass_id: int, outputs: Dict[str, Any]):
         if not self._recording:
             return
         for gatherer_key, gatherer in self._single_pass_gatherers.items():
             single_pass_data = gatherer.collect()
-            self._accumulator.append(forward_pass_id, gatherer_key, single_pass_data)
+            self._accumulator.append(
+                forward_pass_id, gatherer_key, single_pass_data, outputs
+            )
 
     def on_select_experts(self, topk_ids: torch.Tensor):
         self._on_hook("on_select_experts", topk_ids=topk_ids)
@@ -636,6 +648,7 @@ def append(
         forward_pass_id: int,
         gatherer_key: str,
         single_pass_data: Dict,
+        outputs: Dict[str, Any],
     ):
         pass
 
@@ -659,18 +672,19 @@ def __init__(self, *args, **kwargs):
             self._expert_dispatch_collector = ExpertDispatchCollector(
                 self._expert_location_metadata.ep_size
             )
-            self._collection_counter = 0
+            self._metric_heatmap_collection_counter = 0
 
     def append(
         self,
         forward_pass_id: int,
         gatherer_key: str,
         single_pass_data: Dict,
+        outputs: Dict[str, Any],
     ):
-        super().append(forward_pass_id, gatherer_key, single_pass_data)
+        super().append(forward_pass_id, gatherer_key, single_pass_data, outputs)
         if self._enable:
-            self._append_utilization_rate(
-                forward_pass_id, single_pass_data["global_physical_count"]
+            return self._append_utilization_rate(
+                forward_pass_id, single_pass_data["global_physical_count"], outputs
             )
 
     def reset(self):
@@ -679,7 +693,10 @@ def reset(self):
             self._history.clear()
 
     def _append_utilization_rate(
-        self, forward_pass_id: int, single_pass_global_physical_count: torch.Tensor
+        self,
+        forward_pass_id: int,
+        single_pass_global_physical_count: torch.Tensor,
+        outputs: Dict[str, Any],
     ):
         gpu_physical_count = compute_gpu_physical_count(
             single_pass_global_physical_count,
@@ -691,27 +708,37 @@ def _append_utilization_rate(
         )
 
         if self._rank == 0:
-            self._collect_metrics_if_needed(gpu_physical_count)
+            self._handle_metric_eplb_heatmap(gpu_physical_count)
 
-            utilization_rate_tensor = compute_utilization_rate(gpu_physical_count)
-            utilization_rate = torch.mean(utilization_rate_tensor).item()
-            self._history.append(utilization_rate)
-
-            gpu_physical_count_sum = gpu_physical_count.sum().item()
-
-            logger.info(
-                f"[Expert Balancedness] "
-                f"forward_pass_id={forward_pass_id} "
-                f"current_pass_balancedness={utilization_rate:.03f} "
-                f"{''.join(f'last_{size}_average_balancedness={value:.03f} ' for size, value in self._history.mean().items())} "
-                f"gpu_physical_count_sum={gpu_physical_count_sum}"
-                # f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
+            utilization_rate_gpu = torch.mean(
+                compute_utilization_rate(gpu_physical_count)
             )
+            if envs.SGLANG_ENABLE_EPLB_BALANCEDNESS_METRIC.get():
+                print(f"hi {self._rank=} {utilization_rate_gpu=}")
+                outputs["metrics"] = ExpertDistributionMetrics(
+                    eplb_balancedness=utilization_rate_gpu,
+                )
+            else:
+                # TODO: refactor this branch so that it also avoids the `.item()` GPU->CPU sync
+                utilization_rate_cpu = utilization_rate_gpu.item()
+                self._history.append(utilization_rate_cpu)
+
+                gpu_physical_count_sum = gpu_physical_count.sum().item()
+
+                logger.info(
+                    f"[Expert Balancedness] "
+                    f"forward_pass_id={forward_pass_id} "
+                    f"current_pass_balancedness={utilization_rate_cpu:.03f} "
+                    f"{''.join(f'last_{size}_average_balancedness={value:.03f} ' for size, value in self._history.mean().items())} "
+                    f"gpu_physical_count_sum={gpu_physical_count_sum}"
+                    # f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
+                )
 
-    def _collect_metrics_if_needed(self, gpu_physical_count: torch.Tensor):
+    # TODO refactor
+    def _handle_metric_eplb_heatmap(self, gpu_physical_count: torch.Tensor):
         # sglang:eplb_gpu_physical_count metric is disabled if SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL <= 0
         interval = get_int_env_var("SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL", 0)
-        if interval > 0 and self._collection_counter % interval == 0:
+        if interval > 0 and self._metric_heatmap_collection_counter % interval == 0:
             for layer_idx in range(self._expert_location_metadata.num_layers):
                 count_of_layer = (
                     self._expert_dispatch_collector.eplb_gpu_physical_count.labels(
@@ -728,7 +755,7 @@ def _collect_metrics_if_needed(self, gpu_physical_count: torch.Tensor):
                     if count > 0:
                         count_of_layer._sum.inc(count * gpu_rank)
                         count_of_layer._buckets[gpu_rank].inc(count)
-        self._collection_counter += 1
+        self._metric_heatmap_collection_counter += 1
 
 
 class _DequeCollection:
@@ -767,8 +794,9 @@ def append(
         forward_pass_id: int,
         gatherer_key: str,
         single_pass_data: Dict,
+        outputs: Dict[str, Any],
     ):
-        super().append(forward_pass_id, gatherer_key, single_pass_data)
+        super().append(forward_pass_id, gatherer_key, single_pass_data, outputs)
 
         def _process_object(obj):
             if isinstance(obj, torch.Tensor):
@@ -824,8 +852,9 @@ def append(
         forward_pass_id: int,
         gatherer_key: str,
         single_pass_data: Dict,
+        outputs: Dict[str, Any],
     ):
-        super().append(forward_pass_id, gatherer_key, single_pass_data)
+        super().append(forward_pass_id, gatherer_key, single_pass_data, outputs)
         # Can optimize if overhead here is large
         self._global_physical_count_of_buffered_step.append(
             single_pass_data["global_physical_count"]
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
@@ -2221,6 +2221,7 @@ def process_batch_result(
                 if result.copy_done is not None:
                     result.copy_done.synchronize()
 
+        self.log_batch_result_stats(batch, result)
         self.maybe_send_health_check_signal()
 
     def maybe_send_health_check_signal(self):
diff --git a/python/sglang/srt/managers/scheduler_metrics_mixin.py b/python/sglang/srt/managers/scheduler_metrics_mixin.py
@@ -4,20 +4,21 @@
 import time
 from collections import defaultdict
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, List, Optional, Union
 
 from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.environ import envs
 from sglang.srt.managers.io_struct import GetLoadReqInput, GetLoadReqOutput
 from sglang.srt.managers.schedule_policy import PrefillAdder
 from sglang.srt.managers.scheduler import Req, ScheduleBatch
+from sglang.srt.managers.utils import GenerationBatchResult
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
 from sglang.srt.utils import get_bool_env_var
 from sglang.srt.utils.device_timer import DeviceTimer
 
 if TYPE_CHECKING:
-    from sglang.srt.managers.scheduler import Scheduler
+    from sglang.srt.managers.scheduler import EmbeddingBatchResult, Scheduler
 
 logger = logging.getLogger(__name__)
 
@@ -395,6 +396,22 @@ def log_decode_stats(
             self._emit_kv_metrics()
         self._publish_kv_events()
 
+    def log_batch_result_stats(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: Union[GenerationBatchResult, EmbeddingBatchResult],
+    ):
+        if not self.enable_metrics:
+            return
+        if not isinstance(result, GenerationBatchResult):
+            return
+
+        if (m := result.expert_distribution_metrics) is not None:
+            self.metrics_collector.increment_eplb_balancedness(
+                forward_mode=batch.forward_mode.name.lower(),
+                balancedness=m.eplb_balancedness.item(),
+            )
+
     def _emit_kv_metrics(self: Scheduler):
         if not self.enable_kv_cache_events:
             return
diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py
@@ -406,6 +406,7 @@ def forward_batch_generation(
             batch_result = GenerationBatchResult(
                 logits_output=logits_output,
                 can_run_cuda_graph=can_run_cuda_graph,
+                expert_distribution_metrics=out.expert_distribution_metrics,
             )
 
             if is_verify:
@@ -460,6 +461,7 @@ def sample_batch_func():
             return GenerationBatchResult(
                 pp_hidden_states_proxy_tensors=pp_proxy_tensors,
                 can_run_cuda_graph=can_run_cuda_graph,
+                expert_distribution_metrics=out.expert_distribution_metrics,
             )
 
     def forward_batch_split_prefill(self, batch: ScheduleBatch):
@@ -482,6 +484,7 @@ def forward_batch_split_prefill(self, batch: ScheduleBatch):
         batch_result = GenerationBatchResult(
             logits_output=logits_output,
             can_run_cuda_graph=can_run_cuda_graph,
+            expert_distribution_metrics=out.expert_distribution_metrics,
         )
         batch_result.next_token_ids = next_token_ids
         return batch_result
diff --git a/python/sglang/srt/managers/utils.py b/python/sglang/srt/managers/utils.py
@@ -6,6 +6,7 @@
 
 import torch
 
+from sglang.srt.eplb.expert_distribution import ExpertDistributionMetrics
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.overlap_utils import FutureIndices
 from sglang.srt.managers.schedule_batch import Req
@@ -44,6 +45,9 @@ class GenerationBatchResult:
     # relay path: forward stream -> next step forward
     next_draft_input: Optional[EagleDraftInput] = None
 
+    # metrics
+    expert_distribution_metrics: Optional[ExpertDistributionMetrics] = None
+
     def copy_to_cpu(self, return_logprob: bool):
         """Copy tensors to CPU in overlap scheduling.
         Only the tensors which are needed for processing results are copied,
@@ -67,6 +71,9 @@ def copy_to_cpu(self, return_logprob: bool):
         if self.accept_lens is not None:
             self.accept_lens = self.accept_lens.to("cpu", non_blocking=True)
 
+        if (x := self.expert_distribution_metrics) is not None:
+            x.copy_to_cpu()
+
         self.copy_done.record()
 
     @classmethod
diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py
@@ -19,6 +19,7 @@
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.environ import envs
 from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var
@@ -241,7 +242,7 @@ def __init__(
         labels: Dict[str, str],
     ) -> None:
         # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
-        from prometheus_client import Counter, Gauge, Histogram
+        from prometheus_client import Counter, Gauge, Histogram, Summary
 
         self.labels = labels
         self.last_log_time = time.perf_counter()
@@ -641,6 +642,15 @@ def __init__(
             labelnames=list(labels.keys()) + ["mode"],
         )
 
+        if (
+            labels["moe_ep_rank"] == 0
+        ) and envs.SGLANG_ENABLE_EPLB_BALANCEDNESS_METRIC.get():
+            self.eplb_balancedness = Summary(
+                name="sglang:eplb_balancedness",
+                documentation="Balancedness of MoE in expert parallelism.",
+                labelnames=list(labels.keys()) + ["forward_mode"],
+            )
+
         self.new_token_ratio = Gauge(
             name="sglang:new_token_ratio",
             documentation="The new token ratio.",
@@ -698,6 +708,13 @@ def increment_cuda_graph_pass(self, value: bool) -> None:
         mode = "decode_cuda_graph" if value else "decode_none"
         self.cuda_graph_passes_total.labels(**self.labels, mode=mode).inc(1)
 
+    def increment_eplb_balancedness(
+        self, forward_mode: str, balancedness: float
+    ) -> None:
+        self.eplb_balancedness.labels(**self.labels, forward_mode=forward_mode).observe(
+            balancedness
+        )
+
     def increment_realtime_tokens(
         self, prefill_compute_tokens=0, prefill_cache_tokens=0, decode_tokens=0
     ):
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
@@ -68,6 +68,7 @@
 from sglang.srt.environ import envs
 from sglang.srt.eplb.eplb_manager import EPLBManager
 from sglang.srt.eplb.expert_distribution import (
+    ExpertDistributionMetrics,
     ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,
     set_global_expert_distribution_recorder,
@@ -272,6 +273,7 @@ def filter(self, record):
 class ModelRunnerOutput:
     logits_output: Union[LogitsProcessorOutput, PPProxyTensors]
     can_run_graph: bool
+    expert_distribution_metrics: Optional[ExpertDistributionMetrics] = None
 
 
 class ModelRunner:
@@ -2738,14 +2740,15 @@ def forward(
         with get_global_expert_distribution_recorder().with_forward_pass(
             self.forward_pass_id,
             forward_batch,
-        ):
+        ) as recorder_outputs:
             output = self._forward_raw(
                 forward_batch,
                 skip_attn_backend_init,
                 pp_proxy_tensors,
                 reinit_attn_backend,
                 split_forward_count,
             )
+        output.expert_distribution_metrics = recorder_outputs.get("metrics")
 
         if self.eplb_manager is not None:
             self.eplb_manager.on_forward_pass_end()