@@ -198,7 +198,7 @@ def __init__(
198198 num_heads = self .num_local_heads ,
199199 head_size = self .head_dim ,
200200 scale = self .scaling ,
201- multimodal_config = multimodal_config ,
201+ prefix = prefix ,
202202 )
203203
204204 def forward (
@@ -358,16 +358,9 @@ def __init__(
358358 self .act = _ACTIVATION_REGISTRY [config .activation_function ]
359359 self .proj2 = nn .Linear (config .d_model , config .output_dim )
360360
361- # Get attention backend
362- attn_backend_override = (
363- multimodal_config .mm_encoder_attn_backend
364- if multimodal_config is not None
365- else None
366- )
367361 self .attn_backend = get_vit_attn_backend (
368362 head_size = config .d_model // config .encoder_attention_heads ,
369363 dtype = torch .get_default_dtype (),
370- attn_backend_override = attn_backend_override ,
371364 )
372365
373366 def compute_attn_mask_seqlen (self , cu_seqlens : torch .Tensor ) -> torch .Tensor | None :
@@ -553,6 +546,12 @@ def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor:
553546 def get_supported_mm_limits (self ) -> Mapping [str , int | None ]:
554547 return {"audio" : None }
555548
549+ def get_data_parser (self ) -> MultiModalDataParser :
550+ feature_extractor = self .get_feature_extractor ()
551+ return Qwen3ASRMultiModalDataParser (
552+ target_sr = feature_extractor .sampling_rate ,
553+ )
554+
556555
557556class Qwen3ASRDummyInputsBuilder (BaseDummyInputsBuilder [Qwen3ASRProcessingInfo ]):
558557 def get_dummy_text (self , mm_counts : Mapping [str , int ]) -> str :
@@ -622,11 +621,7 @@ def _parse_audio_data(
622621class Qwen3ASRMultiModalProcessor (
623622 Qwen3OmniMoeThinkerMultiModalProcessor ,
624623):
625- def _get_data_parser (self ) -> MultiModalDataParser :
626- feature_extractor = self .info .get_feature_extractor ()
627- return Qwen3ASRMultiModalDataParser (
628- target_sr = feature_extractor .sampling_rate ,
629- )
624+ pass
630625
631626 def _get_mm_fields_config (
632627 self ,
0 commit comments