root@moirai-exp-mark-1:/sgl-workspace/sglang#  cd /sgl-workspace/sglang ; /usr/bin/env /bin/python3 /root/.vscode-server/extensions/ms-python.debugpy-2025.10.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher 55543 -- -m sglang.launch_server --model /models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 --port=8080 --tp-size=8 --mem-fraction-static=0.85 --context-length=524288 --tool-call-parser=pythonic --chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja --attention-backend=fa3 --enable-multimodal 
[2025-07-28 18:43:45] server_args=ServerArgs(model_path='/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8', tokenizer_path='/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=524288, is_embedding=False, enable_multimodal=True, revision=None, model_impl='auto', host='127.0.0.1', port=8080, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.85, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, hybrid_kvcache_ratio=None, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, device='cuda', tp_size=8, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=649496823, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, served_model_name='/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8', chat_template='examples/chat_template/tool_chat_template_llama4_pythonic.jinja', completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser='pythonic', dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend='fa3', sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_cutlass_moe=False, enable_flashinfer_trtllm_moe=False, enable_flashinfer_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', hicache_io_backend='', hicache_storage_backend=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, cuda_graph_max_bs=None, cuda_graph_bs=None, 
disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, enable_triton_kernel_moe=False, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False, enable_pdmux=False, sm_group_num=3)
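
For reproduction without the debugpy wrapper, the invocation above reduces to a plain module launch; a minimal equivalent, with every flag copied from the command and server_args dump above:

python3 -m sglang.launch_server \
    --model /models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 \
    --port 8080 \
    --tp-size 8 \
    --mem-fraction-static 0.85 \
    --context-length 524288 \
    --tool-call-parser pythonic \
    --chat-template examples/chat_template/tool_chat_template_llama4_pythonic.jinja \
    --attention-backend fa3 \
    --enable-multimodal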
[2025-07-28 18:43:48] Loading chat template: examples/chat_template/tool_chat_template_llama4_pythonic.jinja
[2025-07-28 18:43:48] Detected chat template content format: openai
[2025-07-28 18:44:16 TP0] Init torch distributed begin.
[2025-07-28 18:44:21 TP0] sglang is using nccl==2.26.2
[2025-07-28 18:44:22 TP0] Init torch distributed ends. mem usage=1.23 GB
[2025-07-28 18:44:23 TP0] Load weight begin. avail mem=77.69 GB
Loading safetensors checkpoint shards:   0% Completed | 0/84 [00:00<?, ?it/s]
(intermediate shard-loading progress lines omitted)
Loading safetensors checkpoint shards: 100% Completed | 84/84 [00:23<00:00,  3.54it/s]

Setting sliding_window_size to be attention_chunk_size: 8192
[2025-07-28 18:44:47 TP0] Load weight end. type=Llama4ForConditionalGeneration, dtype=torch.bfloat16, avail mem=26.97 GB, mem usage=50.72 GB.
(the sliding_window_size message above repeats once per TP rank; 7 duplicates omitted)
[2025-07-28 18:44:50 TP0] KV Cache is allocated. #tokens: 667335, K size: 7.64 GB, V size: 7.64 GB
(identical KV cache allocation logged by all 8 TP ranks)
[2025-07-28 18:44:50 TP0] Memory pool end. avail mem=7.57 GB
[2025-07-28 18:44:50 TP0] Capture cuda graph begin. This can take up to several minutes. avail mem=7.48 GB
[2025-07-28 18:44:50 TP0] Capture cuda graph bs [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160]
Capturing batches (bs=160 avail_mem=6.23 GB):   0%|                                        | 0/23 [00:00<?, ?it/s]
[2025-07-28 18:44:51 TP0] Using default MoE kernel config. Performance might be sub-optimal! Config file not found at /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
(the same warning is emitted by all 8 TP ranks)
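
The warning points at the fused-MoE Triton tuning benchmark in the sglang repo. A hedged sketch of generating the missing E=128,N=1024 fp8_w8a8 config; the script name and flags below are assumptions based on the linked benchmark directory, so check its README before running:

# assumed entry point and flags; verify against benchmark/kernels/fused_moe_triton
python3 benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
    --model /models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 \
    --tp-size 8 \
    --dtype fp8_w8a8 \
    --tune

The resulting JSON would then go under the configs/triton_3_3_1/ path named in the warning so the kernel picks it up on the next launch.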
Capturing batches (bs=1 avail_mem=1.30 GB): 100%|█████████████████████████████████| 23/23 [00:13<00:00,  1.68it/s]
[2025-07-28 18:45:04 TP0] Registering 2231 cuda graph addresses
(same registration logged by all 8 TP ranks)
[2025-07-28 18:45:04 TP0] Capture cuda graph end. Time elapsed: 14.07 s. mem usage=6.19 GB. avail mem=1.29 GB.
[2025-07-28 18:45:06 TP0] max_total_num_tokens=667335, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=2048, context_len=524288, available_gpu_mem=1.29 GB
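
As a sanity check on the reported KV sizes (assuming the commonly cited Maverick config of 48 layers, 8 KV heads, head_dim 128 — hence 1 KV head per rank at tp=8 — stored in bf16):

    bytes per token per rank (K or V) = 48 layers x 1 head x 128 dim x 2 B = 12,288 B
    667,335 tokens x 12,288 B = 8,200,212,480 B ≈ 7.64 GiB

which matches the per-rank K size and V size of 7.64 GB logged during allocation above.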
[2025-07-28 18:45:07] INFO:     Started server process [100364]
[2025-07-28 18:45:07] INFO:     Waiting for application startup.
[2025-07-28 18:45:07] INFO:     Application startup complete.
[2025-07-28 18:45:07] INFO:     Uvicorn running on http://127.0.0.1:8080 (Press CTRL+C to quit)
[2025-07-28 18:45:08] INFO:     127.0.0.1:40660 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-07-28 18:45:08 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0, 
[2025-07-28 18:45:09] INFO:     127.0.0.1:40666 - "POST /generate HTTP/1.1" 200 OK
[2025-07-28 18:45:09] The server is fired up and ready to roll!
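
The failing requests below are multimodal chat completions. A minimal sketch of that kind of request against the OpenAI-compatible endpoint; the image URL is a placeholder, and the model field matches served_model_name from server_args:

curl http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "/models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/test.jpg"}}
      ]
    }]
  }'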
[2025-07-28 18:45:14] Error in request: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Traceback (most recent call last):
  File "/sgl-workspace/sglang/python/sglang/srt/entrypoints/openai/serving_base.py", line 45, in handle_request
    return await self._handle_non_streaming_request(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/entrypoints/openai/serving_chat.py", line 625, in _handle_non_streaming_request
    ret = await self.tokenizer_manager.generate_request(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/tokenizer_manager.py", line 478, in generate_request
    tokenized_obj = await self._tokenize_one_request(obj)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/managers/tokenizer_manager.py", line 532, in _tokenize_one_request
    mm_inputs: Dict = await self.mm_processor.process_mm_data_async(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/multimodal/processors/mllama4.py", line 47, in process_mm_data_async
    mm_items, input_ids, _ = self.process_and_combine_mm_data(
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/multimodal/processors/base_processor.py", line 597, in process_and_combine_mm_data
    collected_items, input_ids, ret = self._process_and_collect_mm_items(
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/multimodal/processors/base_processor.py", line 540, in _process_and_collect_mm_items
    ret = self.process_mm_data(
          ^^^^^^^^^^^^^^^^^^^^^
  File "/sgl-workspace/sglang/python/sglang/srt/multimodal/processors/base_processor.py", line 218, in process_mm_data
    result = processor.__call__(
             ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/llama4/processing_llama4.py", line 192, in __call__
    image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/image_processing_utils_fast.py", line 614, in __call__
    return self.preprocess(images, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/llama4/image_processing_llama4_fast.py", line 382, in preprocess
    return super().preprocess(images, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/image_processing_utils_fast.py", line 630, in preprocess
    images = self._prepare_input_images(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/image_processing_utils_fast.py", line 539, in _prepare_input_images
    processed_images = [process_image_partial(img) for img in images]
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/image_processing_utils_fast.py", line 498, in _process_image
    image = image.to(device)
            ^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

[2025-07-28 18:45:14] INFO:     127.0.0.1:47160 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
[2025-07-28 18:45:15] Error in request: CUDA error: out of memory
(identical traceback to the first request omitted)

[2025-07-28 18:45:15] INFO:     127.0.0.1:47160 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
[2025-07-28 18:45:17] Error in request: CUDA error: out of memory
(identical traceback omitted)
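
The OOM is raised from image.to(device) inside the transformers fast image processor, i.e. image preprocessing itself runs on the GPU, and after weights, KV cache, and CUDA graphs only 1.29 GB remains free (see the avail mem line above). One hedged mitigation, a sketch rather than a confirmed fix: relaunch with a lower static memory fraction so the preprocessor has headroom, e.g.

# 0.80 is an illustrative value; trades KV-cache tokens for free GPU memory
python3 -m sglang.launch_server \
    --model /models/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 \
    --port 8080 --tp-size 8 \
    --mem-fraction-static 0.80 \
    --context-length 524288 --tool-call-parser pythonic \
    --chat-template examples/chat_template/tool_chat_template_llama4_pythonic.jinja \
    --attention-backend fa3 --enable-multimodal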