(sglang053rc1) root@7248a1e11049:/home# CUDA_VISIBLE_DEVICES=0,1,2,3 python -m sglang.launch_server --served-model-name GLM-Air --model-path /home/models/GLM-4.5-Air-FP8/ --context-length 65536 --tensor-parallel-size 4 --chunked-prefill-size 8192 --port 8011 --mem-fraction-static 0.7 --disable-radix-cache --attention-backend flex_attention WARNING:sglang.srt.server_args:Cuda graph is disabled because of using torch Flex Attention backend [2025-09-22 13:00:57] server_args=ServerArgs(model_path='/home/lyy/models/GLM-4.5-Air-FP8/', tokenizer_path='/home/lyy/models/GLM-4.5-Air-FP8/', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=65536, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='127.0.0.1', port=8011, skip_server_warmup=False, warmups=None, nccl_port=None, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', mem_fraction_static=0.7, max_running_requests=None, max_queued_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', enable_priority_scheduling=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=1, hybrid_kvcache_ratio=None, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', device='cuda', tp_size=4, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=831482852, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-customer-labels', tokenizer_metrics_allowed_customer_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, prompt_tokens_buckets=None, generation_tokens_buckets=None, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, gc_warning_threshold_secs=0.0, enable_trace=False, oltp_traces_endpoint='localhost:4317', api_key=None, served_model_name='GLM-Air', weight_version='default', chat_template=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, tool_server=None, dp_size=1, load_balance_method='round_robin', load_watch_interval=0.1, prefill_round_robin_balance=False, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, lora_backend='triton', max_lora_chunk_size=16, attention_backend='flex_attention', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_draft_model_revision=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_lookahead_min_match_window_size=1, speculative_lookahead_max_match_window_size=12, speculative_lookahead_min_bfs_breadth=1, speculative_lookahead_max_bfs_breadth=10, speculative_lookahead_match_type='BFS', speculative_lookahead_branch_length=18, speculative_lookahead_capacity=10000000, ep_size=1, moe_a2a_backend='none', moe_runner_backend='auto', flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', hicache_storage_backend=None, hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, enable_lmcache=False, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', disable_radix_cache=True, cuda_graph_max_bs=None, cuda_graph_bs=None, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, enable_nccl_nvls=False, enable_symm_mem=False, disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, tbo_token_distribution_threshold=0.48, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, scheduler_recv_interval=1, numa_node=None, enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, custom_weight_loader=[], weight_loader_disable_mmap=False, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, enable_pdmux=False, sm_group_num=3, max_mamba_cache_size=None, mamba_ssm_dtype='float32', enable_deterministic_inference=False, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_cutlass_moe=False, enable_flashinfer_cutedsl_moe=False, enable_flashinfer_trtllm_moe=False, enable_triton_kernel_moe=False, enable_flashinfer_mxfp4_moe=False) `torch_dtype` is deprecated! Use `dtype` instead! [2025-09-22 13:00:57] Using default HuggingFace chat template with detected content format: openai `torch_dtype` is deprecated! Use `dtype` instead! `torch_dtype` is deprecated! Use `dtype` instead! `torch_dtype` is deprecated! Use `dtype` instead! `torch_dtype` is deprecated! Use `dtype` instead! [2025-09-22 13:01:04 TP0] Init torch distributed begin. [rank3]:[W922 13:01:04.244569113 ProcessGroupGloo.cpp:514] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator()) [rank1]:[W922 13:01:05.338085651 ProcessGroupGloo.cpp:514] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator()) [rank2]:[W922 13:01:05.442727188 ProcessGroupGloo.cpp:514] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator()) [rank0]:[W922 13:01:05.450427066 ProcessGroupGloo.cpp:514] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator()) [Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [2025-09-22 13:01:05 TP0] sglang is using nccl==2.27.3 [2025-09-22 13:01:05 TP3] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly. [2025-09-22 13:01:05 TP2] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly. [2025-09-22 13:01:05 TP0] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly. [2025-09-22 13:01:05 TP1] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 [Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 [2025-09-22 13:01:05 TP0] Init torch distributed ends. mem usage=0.16 GB [2025-09-22 13:01:06 TP0] Load weight begin. avail mem=46.76 GB Loading safetensors checkpoint shards: 0% Completed | 0/47 [00:00(), graph=None) args[4]: (TensorBox(StorageBox( InputBuffer(name='arg2_1', layout=FixedLayout('cuda:0', torch.int64, size=[], stride=[])) )), TensorBox(StorageBox( InputBuffer(name='arg2_1', layout=FixedLayout('cuda:0', torch.int64, size=[], stride=[])) )), TensorBox(StorageBox( InputBuffer(name='arg4_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1], stride=[1, 1, 1])) )), TensorBox(StorageBox( InputBuffer(name='arg6_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1, 1], stride=[1, 1, 1, 1])) )), TensorBox(StorageBox( InputBuffer(name='arg7_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1], stride=[1, 1, 1])) )), TensorBox(StorageBox( InputBuffer(name='arg8_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1, 1], stride=[1, 1, 1, 1])) )), TensorBox(StorageBox( InputBuffer(name='arg9_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1], stride=[1, 1, 1])) )), TensorBox(StorageBox( InputBuffer(name='arg10_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1, 1], stride=[1, 1, 1, 1])) )), TensorBox(StorageBox( InputBuffer(name='arg11_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1], stride=[1, 1, 1])) )), TensorBox(StorageBox( InputBuffer(name='arg12_1', layout=FixedLayout('cuda:0', torch.int32, size=[1, 1, 1, 1], stride=[1, 1, 1, 1])) )), s59, s30, Subgraph(name='sdpa_mask0', graph_module=(), graph=None)) args[5]: 0.08838834764831845 args[6]: {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': False} args[7]: () args[8]: () Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" [2025-09-22 13:01:26] Received sigquit from a child process. It usually means the child failed. Killed