
[Bug] Gemma3n MM does not work with LoRA #8775

@lifuhuang

Description


Checklist

  • 1. I have searched related issues but cannot get the expected help.
  • 2. The bug has not been fixed in the latest version.
  • 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
  • 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
  • 5. Please use English, otherwise it will be closed.

Describe the bug

Launching google/gemma-3n-E4B-it with --enable-lora crashes during server startup: the LoRA manager raises "Exception: No corresponding LoRA layer supported for <class 'torch.nn.modules.container.Sequential'>." (The original report attached a screenshot; the full traceback is reproduced in the logs below.)

Reproduction

from sglang.utils import launch_server_cmd, wait_for_server

server_process, port = launch_server_cmd(
    """
python3 -m sglang.launch_server --model-path google/gemma-3n-E4B-it \
    --enable-lora \
    --lora-paths lora0=ericrisco/medical-gemma-3n-lora \
    --max-loras-per-batch 1 --lora-backend triton \
    --disable-radix-cache
"""
)

wait_for_server(f"http://localhost:{port}")
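
For reference, once the crash is fixed, a per-request adapter would be selected through the lora_path field of sglang's native /generate endpoint. A minimal sketch, assuming the server above came up on port; the prompt and sampling parameters are illustrative placeholders:

import requests

# Hypothetical request exercising the adapter registered above as "lora0".
# The prompt and sampling parameters are illustrative only.
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": "List common symptoms of iron-deficiency anemia.",
        "sampling_params": {"max_new_tokens": 64, "temperature": 0},
        "lora_path": "lora0",
    },
)
print(response.json())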
Logs:
INFO 08-04 11:54:36 [__init__.py:235] Automatically detected platform cuda.
/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/layers/quantization/awq.py:42: UserWarning: Using kernels directly from vllm. This might lead to performance degradation or missing functionalities as certain kernels may not be optimized. 
  warnings.warn(
WARNING:sglang.srt.server_args:Multimodal model: Dynamically adjusted --mem-fraction-static from: 0.778 to: 0.591.
[2025-08-04 11:54:36] server_args=ServerArgs(model_path='google/gemma-3n-E4B-it', tokenizer_path='google/gemma-3n-E4B-it', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='127.0.0.1', port=36230, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.59128, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, hybrid_kvcache_ratio=None, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, device='cuda', tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=300353499, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, served_model_name='google/gemma-3n-E4B-it', chat_template=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=True, max_lora_rank=None, lora_target_modules=None, lora_paths={'lora0': LoRARef(lora_id='a5557d0b6e3143e1a7eec6237d167fe9', lora_name='lora0', lora_path='ericrisco/medical-gemma-3n-lora')}, max_loras_per_batch=1, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_moe=False, enable_flashinfer_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', hicache_io_backend='', hicache_storage_backend=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=True, cuda_graph_max_bs=None, cuda_graph_bs=None, disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, 
enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, enable_triton_kernel_moe=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False, enable_pdmux=False, sm_group_num=3)
[2025-08-04 11:54:40] Inferred chat template from model path: gemma-it
INFO 08-04 11:54:43 [__init__.py:235] Automatically detected platform cuda.
INFO 08-04 11:54:43 [__init__.py:235] Automatically detected platform cuda.
/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/layers/quantization/awq.py:42: UserWarning: Using kernels directly from vllm. This might lead to performance degradation or missing functionalities as certain kernels may not be optimized. 
  warnings.warn(
/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/layers/quantization/awq.py:42: UserWarning: Using kernels directly from vllm. This might lead to performance degradation or missing functionalities as certain kernels may not be optimized. 
  warnings.warn(
[2025-08-04 11:54:47] Attention backend not explicitly specified. Use flashinfer backend by default.
[2025-08-04 11:54:47] Init torch distributed begin.
[2025-08-04 11:54:47] Init torch distributed ends. mem usage=0.00 GB
[2025-08-04 11:54:47] Load weight begin. avail mem=43.82 GB
[2025-08-04 11:54:48] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:22<01:07, 22.48s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:42<00:42, 21.20s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [01:20<00:28, 28.92s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:58<00:00, 32.49s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:58<00:00, 29.71s/it]

[2025-08-04 11:56:48] Load weight end. type=Gemma3nForConditionalGeneration, dtype=torch.bfloat16, avail mem=29.04 GB, mem usage=14.78 GB.
[2025-08-04 11:56:48] Using triton as backend of LoRA kernels.
[2025-08-04 11:56:49] Using model weights format ['*.safetensors']
[2025-08-04 11:56:50] No model.safetensors.index.json found in remote.
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 41.54it/s]

[2025-08-04 11:56:50] Scheduler hit an exception: Traceback (most recent call last):
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 2923, in run_scheduler_process
    scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 344, in __init__
    self.tp_worker = TpWorkerClass(
                     ^^^^^^^^^^^^^^
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/managers/tp_worker_overlap_thread.py", line 66, in __init__
    self.worker = TpModelWorker(
                  ^^^^^^^^^^^^^^
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/managers/tp_worker.py", line 81, in __init__
    self.model_runner = ModelRunner(
                        ^^^^^^^^^^^^
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 233, in __init__
    self.initialize(min_per_gpu_memory)
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 309, in initialize
    self.init_lora_manager()
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 886, in init_lora_manager
    self.lora_manager = LoRAManager(
                        ^^^^^^^^^^^^
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/lora/lora_manager.py", line 76, in __init__
    self.init_state(
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/lora/lora_manager.py", line 360, in init_state
    self.init_lora_modules()
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/lora/lora_manager.py", line 479, in init_lora_modules
    self.lora_modules[layer_id][module_name] = self.set_lora_module(
                                               ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/lora/lora_manager.py", line 449, in set_lora_module
    lora_module = get_lora_layer(module, self.lora_backend)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/adap/lib/python3.11/site-packages/sglang/srt/lora/layers.py", line 370, in get_lora_layer
    raise Exception(f"No corresponding LoRA layer supported for {type(layer)}.")
Exception: No corresponding LoRA layer supported for <class 'torch.nn.modules.container.Sequential'>.

[2025-08-04 11:56:50] Received sigquit from a child process. It usually means the child failed.
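
For context, the crash is a type-dispatch miss: get_lora_layer in sglang/srt/lora/layers.py maps supported base-layer classes to their LoRA wrappers and raises for anything else, and one of Gemma-3n's target modules is a plain torch.nn.Sequential. A minimal sketch of that failure mode (the registry contents here are illustrative stand-ins, not sglang's actual mapping):

import torch.nn as nn

# Illustrative registry keyed by supported base-layer types; sglang's real
# mapping covers its parallel linear/embedding layers, not these stand-ins.
SUPPORTED_LORA_LAYERS = {
    nn.Linear: "LinearWithLoRA",
    nn.Embedding: "EmbeddingWithLoRA",
}

def get_lora_layer(layer: nn.Module) -> str:
    # Dispatch on the module's concrete type; anything unrecognized raises,
    # which is exactly what happens when Gemma-3n hands over an nn.Sequential.
    for base_type, wrapper in SUPPORTED_LORA_LAYERS.items():
        if isinstance(layer, base_type):
            return wrapper
    raise Exception(f"No corresponding LoRA layer supported for {type(layer)}.")

get_lora_layer(nn.Linear(8, 8))                 # dispatches fine
get_lora_layer(nn.Sequential(nn.Linear(8, 8)))  # raises, as in the traceback

Supporting Gemma-3n would presumably mean either teaching the LoRA manager to unwrap such container modules or excluding them from the LoRA target modules.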

Environment

N/A
