Skip to content

Commit 3ccd7fa

Browse files
authored
[CI] Fix B200 CI (#13387)
1 parent 254f62d commit 3ccd7fa

4 files changed

Lines changed: 87 additions & 62 deletions

File tree

.github/workflows/pr-test.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -900,6 +900,9 @@ jobs:
900900
RUNNER_LABELS: 4-gpu-b200
901901
strategy:
902902
fail-fast: false
903+
matrix:
904+
part: [0, 1]
905+
903906
steps:
904907
- name: Checkout code
905908
uses: actions/checkout@v4
@@ -917,10 +920,10 @@ jobs:
917920
CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
918921
919922
- name: Run test
920-
timeout-minutes: 45
923+
timeout-minutes: 30
921924
run: |
922925
cd test/srt
923-
python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600
926+
python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800
924927
925928
unit-test-backend-4-gpu-gb200:
926929
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels-arm]

test/srt/run_suite.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,15 +178,16 @@ class TestFile:
178178
TestFile("test_disaggregation_dp_attention.py", 155),
179179
],
180180
"per-commit-4-gpu-b200": [
181-
TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
181+
TestFile("test_deepseek_v3_fp4_4gpu.py", 1800),
182182
TestFile("test_flash_attention_4.py", 300),
183183
TestFile("test_gpt_oss_4gpu.py", 600),
184184
TestFile("test_llama31_fp4.py", 300),
185-
TestFile("test_eagle_infer_beta_dp_attention.py", 200),
185+
# TODO: Add it back after the bug is fixed
186+
# TestFile("test_eagle_infer_beta_dp_attention.py", 200),
186187
],
187188
"per-commit-4-gpu-gb200": [
188189
TestFile("test_cutedsl_moe.py", 300),
189-
TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
190+
TestFile("test_deepseek_v3_fp4_4gpu.py", 1800),
190191
# Disabled temporarily, see https://github.com/sgl-project/sglang/issues/12533
191192
# TestFile("test_deepseek_v3_cutedsl_4gpu.py", 3600),
192193
],
@@ -219,9 +220,10 @@ class TestFile:
219220
],
220221
"nightly-4-gpu-b200": [
221222
TestFile("nightly/test_flashinfer_trtllm_gen_moe_backend.py", 300),
222-
TestFile("test_fp4_moe.py", 300),
223223
TestFile("nightly/test_gpt_oss_4gpu_perf.py", 600),
224224
TestFile("nightly/test_flashinfer_trtllm_gen_attn_backend.py", 300),
225+
TestFile("test_deepseek_v3_fp4_cutlass_moe.py", 900),
226+
TestFile("test_fp4_moe.py", 300),
225227
],
226228
"nightly-8-gpu-b200": [],
227229
"nightly-4-gpu": [
@@ -299,6 +301,7 @@ class TestFile:
299301
TestFile("test_deepseek_chat_templates.py"),
300302
TestFile("test_disaggregation.py"),
301303
TestFile("test_double_sparsity.py"),
304+
TestFile("test_eagle_infer_beta_dp_attention.py"),
302305
TestFile("test_embedding_openai_server.py"),
303306
TestFile("test_enable_thinking.py"),
304307
TestFile("test_eplb.py"),

test/srt/test_deepseek_v3_fp4_4gpu.py

Lines changed: 3 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
99
from sglang.test.send_one import BenchArgs, send_one_prompt
1010
from sglang.test.test_utils import (
11-
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
1211
DEFAULT_URL_FOR_TEST,
1312
CustomTestCase,
1413
is_in_ci,
@@ -17,6 +16,7 @@
1716
)
1817

1918
FULL_DEEPSEEK_V3_FP4_MODEL_PATH = "nvidia/DeepSeek-V3-0324-FP4"
19+
SERVER_LAUNCH_TIMEOUT = 1000
2020

2121

2222
class TestDeepseekV3FP4(CustomTestCase):
@@ -41,7 +41,7 @@ def setUpClass(cls):
4141
cls.process = popen_launch_server(
4242
cls.model,
4343
cls.base_url,
44-
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
44+
timeout=SERVER_LAUNCH_TIMEOUT,
4545
other_args=other_args,
4646
)
4747

@@ -114,7 +114,7 @@ def setUpClass(cls):
114114
cls.process = popen_launch_server(
115115
cls.model,
116116
cls.base_url,
117-
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
117+
timeout=SERVER_LAUNCH_TIMEOUT,
118118
other_args=other_args,
119119
)
120120

@@ -172,58 +172,5 @@ def test_bs_1_speed(self):
172172
self.assertGreater(speed, 150)
173173

174174

175-
class TestDeepseekV3FP4CutlassMoE(CustomTestCase):
176-
@classmethod
177-
def setUpClass(cls):
178-
cls.model = FULL_DEEPSEEK_V3_FP4_MODEL_PATH
179-
cls.base_url = DEFAULT_URL_FOR_TEST
180-
other_args = [
181-
"--tp",
182-
"4",
183-
"--ep",
184-
"4",
185-
"--attention-backend",
186-
"trtllm_mla",
187-
"--moe-runner-backend",
188-
"flashinfer_cutlass",
189-
"--quantization",
190-
"modelopt_fp4",
191-
"--model-loader-extra-config",
192-
'{"enable_multithread_load": true}',
193-
]
194-
cls.process = popen_launch_server(
195-
cls.model,
196-
cls.base_url,
197-
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
198-
other_args=other_args,
199-
)
200-
201-
@classmethod
202-
def tearDownClass(cls):
203-
kill_process_tree(cls.process.pid)
204-
205-
def test_a_gsm8k(
206-
self,
207-
): # Append an "a" to make this test run first (alphabetically) to warm up the server
208-
args = SimpleNamespace(
209-
num_shots=8,
210-
data_path=None,
211-
num_questions=1319,
212-
parallel=1319,
213-
max_new_tokens=512,
214-
host="http://127.0.0.1",
215-
port=int(self.base_url.split(":")[-1]),
216-
)
217-
metrics = run_eval_few_shot_gsm8k(args)
218-
print(f"{metrics=}")
219-
220-
if is_in_ci():
221-
write_github_step_summary(
222-
f"### test_gsm8k (deepseek-v3-fp4-cutlass-moe)\n"
223-
f'{metrics["accuracy"]=:.3f}\n'
224-
)
225-
self.assertGreater(metrics["accuracy"], 0.935)
226-
227-
228175
if __name__ == "__main__":
229176
unittest.main()
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import unittest
2+
from types import SimpleNamespace
3+
4+
from sglang.srt.utils import kill_process_tree
5+
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
6+
from sglang.test.test_utils import (
7+
DEFAULT_URL_FOR_TEST,
8+
CustomTestCase,
9+
is_in_ci,
10+
popen_launch_server,
11+
write_github_step_summary,
12+
)
13+
14+
FULL_DEEPSEEK_V3_FP4_MODEL_PATH = "nvidia/DeepSeek-V3-0324-FP4"
15+
SERVER_LAUNCH_TIMEOUT = 1000
16+
17+
18+
class TestDeepseekV3FP4CutlassMoE(CustomTestCase):
19+
@classmethod
20+
def setUpClass(cls):
21+
cls.model = FULL_DEEPSEEK_V3_FP4_MODEL_PATH
22+
cls.base_url = DEFAULT_URL_FOR_TEST
23+
other_args = [
24+
"--tp",
25+
"4",
26+
"--ep",
27+
"4",
28+
"--attention-backend",
29+
"trtllm_mla",
30+
"--moe-runner-backend",
31+
"flashinfer_cutlass",
32+
"--quantization",
33+
"modelopt_fp4",
34+
"--model-loader-extra-config",
35+
'{"enable_multithread_load": true}',
36+
]
37+
cls.process = popen_launch_server(
38+
cls.model,
39+
cls.base_url,
40+
timeout=SERVER_LAUNCH_TIMEOUT,
41+
other_args=other_args,
42+
)
43+
44+
@classmethod
45+
def tearDownClass(cls):
46+
kill_process_tree(cls.process.pid)
47+
48+
def test_a_gsm8k(
49+
self,
50+
): # Append an "a" to make this test run first (alphabetically) to warm up the server
51+
args = SimpleNamespace(
52+
num_shots=8,
53+
data_path=None,
54+
num_questions=1319,
55+
parallel=1319,
56+
max_new_tokens=512,
57+
host="http://127.0.0.1",
58+
port=int(self.base_url.split(":")[-1]),
59+
)
60+
metrics = run_eval_few_shot_gsm8k(args)
61+
print(f"{metrics=}")
62+
63+
if is_in_ci():
64+
write_github_step_summary(
65+
f"### test_gsm8k (deepseek-v3-fp4-cutlass-moe)\n"
66+
f'{metrics["accuracy"]=:.3f}\n'
67+
)
68+
self.assertGreater(metrics["accuracy"], 0.935)
69+
70+
71+
if __name__ == "__main__":
72+
unittest.main()

0 commit comments

Comments
 (0)