root@moirai-exp-mark-1:/sgl-workspace/sglang#  cd /sgl-workspace/sglang ; /usr/bin/env /bin/python3 /root/.vscode-server/extensions/ms-python.debugpy-2025.10.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 45519 -- -m sglang.launch_server --model /models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 --port=8080 --tp-size=8 --mem-fraction-static=0.85 --context-length=524288 --tool-call-parser=pythonic --chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja --attention-backend=fa3 --enable-multimodal 
[2025-07-28 18:36:28] server_args=ServerArgs(model_path='/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8', tokenizer_path='/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=524288, is_embedding=False, enable_multimodal=True, revision=None, model_impl='auto', host='127.0.0.1', port=8080, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.85, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, hybrid_kvcache_ratio=None, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, device='cuda', tp_size=8, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=9684118, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, served_model_name='/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8', chat_template='examples/chat_template/tool_chat_template_llama4_pythonic.jinja', completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser='pythonic', dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend='fa3', sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_cutlass_moe=False, enable_flashinfer_trtllm_moe=False, enable_flashinfer_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', hicache_io_backend='', hicache_storage_backend=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, cuda_graph_max_bs=None, cuda_graph_bs=None, 
disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, enable_triton_kernel_moe=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False, enable_pdmux=False, sm_group_num=3)
[2025-07-28 18:36:30] Loading chat template: examples/chat_template/tool_chat_template_llama4_pythonic.jinja
[2025-07-28 18:36:30] Detected chat template content format: openai
[2025-07-28 18:36:54 TP0] Init torch distributed begin.
[2025-07-28 18:37:00 TP0] sglang is using nccl==2.26.2
[2025-07-28 18:37:02 TP0] Init torch distributed ends. mem usage=1.23 GB
[2025-07-28 18:37:03 TP0] Load weight begin. avail mem=77.69 GB
Loading safetensors checkpoint shards:   0% Completed | 0/84 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 84/84 [00:24<00:00,  3.41it/s]

Setting sliding_window_size to be attention_chunk_size: 8192
[2025-07-28 18:37:28 TP0] Load weight end. type=Llama4ForConditionalGeneration, dtype=torch.bfloat16, avail mem=26.97 GB, mem usage=50.72 GB.
[2025-07-28 18:37:29 TP0] KV Cache is allocated. #tokens: 667335, K size: 7.64 GB, V size: 7.64 GB
[2025-07-28 18:37:29 TP0] Memory pool end. avail mem=7.57 GB
[2025-07-28 18:37:29 TP0] Capture cuda graph begin. This can take up to several minutes. avail mem=7.48 GB
[2025-07-28 18:37:29 TP0] Capture cuda graph bs [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160]
Capturing batches (bs=160 avail_mem=6.23 GB):   0%|                                        | 0/23 [00:00<?, ?it/s]
[2025-07-28 18:37:30 TP0] Using default MoE kernel config. Performance might be sub-optimal! Config file not found at /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
Capturing batches (bs=1 avail_mem=1.30 GB): 100%|█████████████████████████████████| 23/23 [00:16<00:00,  1.43it/s]
[2025-07-28 18:37:45 TP0] Registering 2231 cuda graph addresses
[2025-07-28 18:37:45 TP0] Capture cuda graph end. Time elapsed: 16.53 s. mem usage=6.19 GB. avail mem=1.29 GB.
[2025-07-28 18:37:47 TP0] max_total_num_tokens=667335, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=2048, context_len=524288, available_gpu_mem=1.29 GB
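The KV-cache accounting above is internally consistent. A minimal sketch checking the arithmetic, under the assumption that Llama-4-Maverick uses 48 decoder layers and 8 KV heads of head_dim 128 in bf16, split across tp-size=8 so each rank holds one KV head (these shape numbers come from the model config, not from this log):

# Hedged sanity check of the logged KV-cache sizing. The model-shape
# numbers below are assumptions read from the Llama-4 config, not this log.
num_layers = 48
kv_heads_per_rank = 8 // 8      # 8 KV heads / tp_size 8 -> 1 per rank
head_dim = 128
bytes_per_elem = 2              # bf16
tokens = 667_335                # max_total_num_tokens from the line above

k_bytes_per_token = num_layers * kv_heads_per_rank * head_dim * bytes_per_elem
print(k_bytes_per_token)                             # 12288 bytes/token/rank
print(round(k_bytes_per_token * tokens / 2**30, 2))  # ~7.64, matching "K size: 7.64 GB" (GiB)

The V cache is the same size, giving the 7.64 GB + 7.64 GB per rank reported by the TP workers.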
[2025-07-28 18:37:48] INFO:     Started server process [88212]
[2025-07-28 18:37:48] INFO:     Waiting for application startup.
[2025-07-28 18:37:48] INFO:     Application startup complete.
[2025-07-28 18:37:48] INFO:     Uvicorn running on http://127.0.0.1:8080 (Press CTRL+C to quit)
[2025-07-28 18:37:49] INFO:     127.0.0.1:49292 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-07-28 18:37:49 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0, 
[2025-07-28 18:37:50] INFO:     127.0.0.1:49296 - "POST /generate HTTP/1.1" 200 OK
[2025-07-28 18:37:50] The server is fired up and ready to roll!
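With startup complete, the native endpoints are live; the warmup request above already exercised POST /generate. A minimal client sketch against that same endpoint (prompt and sampling parameters are illustrative, not taken from this session):

# Hedged client sketch for SGLang's native /generate endpoint, the same
# route the server's warmup request hit above.
import requests

resp = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "text": "List three uses of a hash map.",
        "sampling_params": {"temperature": 0.0, "max_new_tokens": 64},
    },
)
resp.raise_for_status()
print(resp.json()["text"])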
[2025-07-28 18:37:56 TP0] Prefill batch. #new-seq: 1, #new-token: 8192, #cached-token: 1, token usage: 0.00, #running-req: 0, #queue-req: 0, 
[2025-07-28 18:37:56 TP0] TpModelWorkerClient hit an exception: Traceback (most recent call last):
  File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 140, in forward_thread_func
    self.forward_thread_func_()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 175, in forward_thread_func_
    self.worker.forward_batch_generation(
  File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 229, in forward_batch_generation
    logits_output, can_run_cuda_graph = self.model_runner.forward(
                                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1552, in forward
    output = self._forward_raw(
             ^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1597, in _forward_raw
    ret = self.forward_extend(
          ^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1497, in forward_extend
    return self.model.forward(
           ^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/models/mllama4.py", line 177, in forward
    hs = general_mm_embed_routine(
         ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 650, in general_mm_embed_routine
    inputs_embeds = embed_mm_inputs(
                    ^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 573, in embed_mm_inputs
    embedding, mask = get_embedding_and_mask(
                      ^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 484, in get_embedding_and_mask
    embedding = _get_chunked_prefill_embedding(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 388, in _get_chunked_prefill_embedding
    embedding_per_req = data_embedding_func(embedding_items_per_req)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/models/mllama4.py", line 160, in get_image_feature
    image_outputs = self.vision_model(pixel_values, output_hidden_states=False)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/llama4/modeling_llama4.py", line 1180, in forward
    output = self.model(
             ^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/llama4/modeling_llama4.py", line 1016, in forward
    layer_outputs = encoder_layer(
                    ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_layers.py", line 83, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/llama4/modeling_llama4.py", line 935, in forward
    hidden_state, attn_weights = self.self_attn(
                                 ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/llama4/modeling_llama4.py", line 865, in forward
    query_states = self.q_proj(hidden_states).view(hidden_shape)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/linear.py", line 125, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 124.00 MiB. GPU 0 has a total capacity of 79.44 GiB of which 82.69 MiB is free. Process 3324125 has 79.35 GiB memory in use. Of the allocated memory 76.19 GiB is allocated by PyTorch, with 30.38 MiB allocated in private pools (e.g., CUDA Graphs), and 277.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
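The failing allocation is only 124 MiB: after weights (50.72 GB), the KV pool (15.28 GB), and CUDA graphs (6.19 GB), GPU 0 is left with roughly 1.29 GB of headroom, and the vision tower's activations during this first image-bearing chunked prefill apparently exceed it. A relaunch sketch along the lines the error message itself suggests: enable expandable segments and lower --mem-fraction-static to leave more activation headroom (0.80 is an illustrative value, not a tuned one; the commented --cuda-graph-max-bs line is a further optional lever):

# Hedged relaunch sketch: the original command's flags plus the allocator
# setting the error message suggests, and a lower static memory fraction.
# 0.80 is an illustrative guess, not a tuned value.
import os
import subprocess

env = dict(os.environ, PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")
subprocess.run(
    [
        "python3", "-m", "sglang.launch_server",
        "--model", "/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        "--port=8080",
        "--tp-size=8",
        "--mem-fraction-static=0.80",  # was 0.85; freed memory goes to activations
        "--context-length=524288",
        "--tool-call-parser=pythonic",
        "--chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja",
        "--attention-backend=fa3",
        "--enable-multimodal",
        # "--cuda-graph-max-bs=64",    # optional: cap captured batch sizes to reclaim graph memory
    ],
    env=env,
)

Both levers trade some peak throughput for the headroom the vision encoder needs.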