python3 -m sglang.launch_server \
--model-path unsloth/Meta-Llama-3.1-8B-Instruct \
--enable-lora \
--cuda-graph-max-bs 8 \
--max-loras-per-batch 3 \
--max-lora-rank 256 \
--lora-target-modules all \
--lora-paths '{"lora_name":"lora0","lora_path":"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16","pinned":true}' '{"lora_name":"lora1","lora_path":"algoprog/fact-generation-llama-3.1-8b-instruct-lora"}' lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora \
--log-level warning
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 124.02it/s]
Capturing batches (bs=8 avail_mem=21.36 GB): 0%| | 0/4 [00:00<?, ?it/s]
[2025-12-10 02:25:20] Scheduler hit an exception: Traceback (most recent call last):
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 2680, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 320, in __init__
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 245, in __init__
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 358, in __init__
self.initialize(min_per_gpu_memory)
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 510, in initialize
self.init_device_graphs()
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 2446, in init_device_graphs
self.graph_runner = graph_runners[self.device](self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 352, in __init__
self.capture()
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 507, in capture
_capture_one_stream()
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 494, in _capture_one_stream
) = self.capture_one_batch_size(bs, forward, stream_idx)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 698, in capture_one_batch_size
run_once()
File "/sgl-workspace/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py", line 685, in run_once
logits_output_or_pp_proxy_tensors = forward(
^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/models/llama.py", line 469, in forward
hidden_states = self.model(
^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/models/llama.py", line 326, in forward
hidden_states = self.embed_tokens(input_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/lora/layers.py", line 184, in forward
base_output = self.apply_lora(base_output, input_, batch_info)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/lora/layers.py", line 101, in apply_lora
lora_output = self.lora_backend.run_lora_b_sgemm(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/lora/backend/chunked_backend.py", line 58, in run_lora_b_sgemm
return chunked_sgmv_lora_expand_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/sgl-workspace/sglang/python/sglang/srt/lora/triton_ops/chunked_sgmv_expand.py", line 163, in chunked_sgmv_lora_expand_forward
assert x.is_contiguous()
^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'is_contiguous'
[2025-12-10 02:25:20] Received sigquit from a child process. It usually means the child failed.
Killed
python3 -m sglang.launch_server \
--model-path unsloth/Meta-Llama-3.1-8B-Instruct \
--enable-lora \
--cuda-graph-max-bs 8 \
--max-loras-per-batch 3 \
--max-lora-rank 256 \
--lora-target-modules all \
--lora-paths '{"lora_name":"lora0","lora_path":"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16","pinned":true}' '{"lora_name":"lora1","lora_path":"algoprog/fact-generation-llama-3.1-8b-instruct-lora"}' lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora \
--log-level warning
Checklist
Describe the bug
The LoRA launch command above (as run in the notebook) is broken. The regression was introduced by #14177 and can be seen failing in CI: https://github.com/sgl-project/sglang/actions/runs/20083442362/job/57615689992?pr=14764
Reproduction
Environment
Latest main