|
# Prefill and decode instances are required to use the same attention-TP
# size per DP rank; MLA-based models are the only supported exception.
decode_rank_in_group = self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size
pp_response_factor = self.prefill_pp_size // self.kv_mgr.pp_size

if self.kv_mgr.attn_tp_size == self.prefill_attn_tp_size:
    # Equal TP sizes: one-to-one mapping between decode and prefill ranks.
    self.target_tp_rank = decode_rank_in_group
    self.required_dst_info_num = 1
    self.required_prefill_response_num = pp_response_factor
    self.target_tp_ranks = [self.target_tp_rank]
elif self.kv_mgr.attn_tp_size > self.prefill_attn_tp_size:
    # Decode TP group is larger: several decode ranks share one prefill rank.
    if not self.kv_mgr.is_mla_backend:
        logger.warning_once(
            "Performance is NOT guaranteed when using different TP sizes for non-MLA models. "
        )
    tp_ratio = self.kv_mgr.attn_tp_size // self.prefill_attn_tp_size
    self.target_tp_rank = decode_rank_in_group // tp_ratio
    self.required_dst_info_num = tp_ratio
    self.required_prefill_response_num = pp_response_factor
    self.target_tp_ranks = [self.target_tp_rank]
else:
    # Prefill TP group is larger: each decode rank maps onto a contiguous
    # span of prefill ranks.
    if not self.kv_mgr.is_mla_backend:
        logger.warning_once(
            "Performance is NOT guaranteed when using different TP sizes for non-MLA models. "
        )
    tp_ratio = self.prefill_attn_tp_size // self.kv_mgr.attn_tp_size
    # For non-MLA models, one decode rank needs to retrieve KVCache from
    # multiple prefill ranks.
    self.target_tp_ranks = list(
        range(
            decode_rank_in_group * tp_ratio,
            (decode_rank_in_group + 1) * tp_ratio,
        )
    )

    # For MLA models, we can retrieve KVCache from only one prefill rank,
    # but we still need to maintain multiple connections in the connection
    # pool and have to send dummy requests to the other prefill ranks, or
    # the KVPoll will never be set correctly.
    self.target_tp_rank = self.target_tp_ranks[0]
    self.required_dst_info_num = 1
    if self.kv_mgr.is_mla_backend:
        self.required_prefill_response_num = pp_response_factor
    else:
        self.required_prefill_response_num = tp_ratio * pp_response_factor
For now, the two major KV managers (Mooncake and NIXL) contain a large amount of
duplicated code. For example, compare
`sglang/python/sglang/srt/disaggregation/mooncake/conn.py` (lines 1267–1321 at
commit 8717d6b) with
`sglang/python/sglang/srt/disaggregation/common/conn.py` (lines 143–184 at
commit 8717d6b). The core problem is that the Mooncake implementation does not
inherit from the common base classes (`CommonKVReceiver`, `CommonKVManager`, etc.).