python3 hf_ptq.py --pyt_ckpt_path meta-llama/Llama-4-Scout-17B-16E-Instruct --qformat fp8 --export_fmt hf --export_path Llama-4-Scout-17B-16E-Instruct-FP8 --trust_remote_code
python3 -m sglang.launch_server --model Llama-4-Scout-17B-16E-Instruct-FP8 --tp 8
[2025-05-27 21:20:09 TP7] Scheduler hit an exception: Traceback (most recent call last):
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2322, in run_scheduler_process
scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 280, in __init__
self.tp_worker = TpWorkerClass(
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 64, in __init__
self.worker = TpModelWorker(
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 78, in __init__
self.model_runner = ModelRunner(
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 233, in __init__
self.initialize(min_per_gpu_memory)
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 274, in initialize
self.load_model()
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 541, in load_model
self.model = get_model(
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/__init__.py", line 22, in get_model
return loader.load_model(
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 376, in load_model
model = _initialize_model(
File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 153, in _initialize_model
return model_class(
File "/sgl-workspace/sglang/python/sglang/srt/models/llama4.py", line 526, in __init__
super().__init__(config, quant_config, prefix)
File "/sgl-workspace/sglang/python/sglang/srt/models/llama.py", line 413, in __init__
self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
File "/sgl-workspace/sglang/python/sglang/srt/models/llama4.py", line 537, in _init_model
return Llama4Model(config, quant_config=quant_config, prefix=prefix)
File "/sgl-workspace/sglang/python/sglang/srt/models/llama4.py", line 470, in __init__
self.layers = make_layers(
File "/sgl-workspace/sglang/python/sglang/srt/utils.py", line 465, in make_layers
+ [
File "/sgl-workspace/sglang/python/sglang/srt/utils.py", line 466, in <listcomp>
maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
File "/sgl-workspace/sglang/python/sglang/srt/models/llama4.py", line 472, in <lambda>
lambda idx, prefix: Llama4DecoderLayer(
File "/sgl-workspace/sglang/python/sglang/srt/models/llama4.py", line 372, in __init__
self.feed_forward = Llama4MoE(
File "/sgl-workspace/sglang/python/sglang/srt/models/llama4.py", line 107, in __init__
self.experts = FusedMoE(
File "/sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 340, in __init__
assert self.quant_method is not None
AssertionError
Describe the bug
Serving a Llama-4-Scout-17B-16E-Instruct checkpoint that was FP8-quantized and exported in HF format via ModelOpt's `hf_ptq.py` fails at SGLang server startup: model loading aborts with `AssertionError` at `FusedMoE.__init__` (`assert self.quant_method is not None` in `layers/moe/fused_moe_triton/layer.py:340`), i.e. no quantization method is resolved for the MoE expert layers from the exported checkpoint's quantization config.
Steps/Code to reproduce bug
Run the two commands shown at the top of this report: first quantize/export the checkpoint with `hf_ptq.py` (`--qformat fp8 --export_fmt hf`), then launch the SGLang server on the exported directory with `--tp 8`. The server crashes during scheduler initialization with the traceback above.
Expected behavior
System information
Click to expand: Python script to automatically collect system information