|
29 | 29 | ) |
30 | 30 |
|
31 | 31 | import torch |
| 32 | +import torch.nn.functional as F |
32 | 33 |
|
33 | 34 | try: |
34 | 35 | from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx, routing |
@@ -443,6 +444,25 @@ def scoring_func_impl(gating_output: torch.Tensor) -> torch.Tensor: |
443 | 444 | return topk_weights, topk_ids |
444 | 445 |
|
445 | 446 |
|
def fused_topk_softmax_torch_raw_logits(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
):
    """Select top-k experts directly from raw router logits.

    Unlike the usual softmax-then-topk routing, this picks the top-k experts
    on the *raw* logits and, when ``renormalize`` is set, applies softmax over
    only the k gathered logits — so the returned weights sum to 1 across the
    selected experts rather than across all experts.

    Args:
        hidden_states: Token activations; only ``shape[0]`` (token count) is
            used, to validate agreement with ``gating_output``.
        gating_output: Raw router logits, shape ``(num_tokens, num_experts)``.
        topk: Number of experts to select per token.
        renormalize: If True, softmax the gathered top-k logits; if False,
            return the selected experts' raw logits (as float32).

    Returns:
        Tuple ``(topk_weights, topk_ids)`` with dtypes float32 and int32.
        NOTE: within a row, the ordering of ``topk_ids`` is unspecified
        because ``sorted=False`` is passed to ``torch.topk``.
    """
    assert (
        hidden_states.shape[0] == gating_output.shape[0]
    ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}"

    # Top-k on the raw (possibly low-precision) logits; sorted order is not
    # required by downstream consumers.
    _, topk_ids = torch.topk(gating_output, k=topk, dim=-1, sorted=False)
    # Gather the winners' logits in float32 for a numerically stable softmax.
    topk_weights = gating_output.float().gather(1, topk_ids)
    if renormalize:
        topk_weights = F.softmax(topk_weights, dim=-1, dtype=torch.float32)

    # topk_weights is already float32 here; only the ids need a cast.
    return topk_weights, topk_ids.to(torch.int32)
| 465 | + |
446 | 466 | def fused_topk_cpu( |
447 | 467 | hidden_states: torch.Tensor, |
448 | 468 | gating_output: torch.Tensor, |
@@ -1030,15 +1050,28 @@ def select_experts( |
1030 | 1050 | ) |
1031 | 1051 | elif custom_routing_function is None: |
1032 | 1052 | assert not apply_routed_scaling_factor_on_output, "Not implemented" |
1033 | | - # Qwen3MOE uses fused_topk |
1034 | | - topk_weights, topk_ids = fused_topk( |
1035 | | - hidden_states=hidden_states, |
1036 | | - gating_output=router_logits, |
1037 | | - topk=num_routed_topk if _use_aiter else top_k, |
1038 | | - renormalize=renormalize, |
1039 | | - correction_bias=correction_bias, |
1040 | | - scoring_func=scoring_func, |
1041 | | - ) |
| 1053 | + if ( |
| 1054 | + get_moe_runner_backend().is_flashinfer_trtllm_routed() |
| 1055 | + and scoring_func == "softmax" |
| 1056 | + and correction_bias is None |
| 1057 | + ): |
| 1058 | + # flashinfer_trtllm_routed uses raw-logits topk |
| 1059 | + topk_weights, topk_ids = fused_topk_softmax_torch_raw_logits( |
| 1060 | + hidden_states=hidden_states, |
| 1061 | + gating_output=router_logits, |
| 1062 | + topk=num_routed_topk if _use_aiter else top_k, |
| 1063 | + renormalize=renormalize, |
| 1064 | + ) |
| 1065 | + else: |
| 1066 | + # Qwen3MOE uses fused_topk |
| 1067 | + topk_weights, topk_ids = fused_topk( |
| 1068 | + hidden_states=hidden_states, |
| 1069 | + gating_output=router_logits, |
| 1070 | + topk=num_routed_topk if _use_aiter else top_k, |
| 1071 | + renormalize=renormalize, |
| 1072 | + correction_bias=correction_bias, |
| 1073 | + scoring_func=scoring_func, |
| 1074 | + ) |
1042 | 1075 | else: |
1043 | 1076 | assert ( |
1044 | 1077 | num_token_non_padded is None |
|
0 commit comments