huggingface · vasqu · Feb 9, 2026 · Feb 7, 2026 · Jan 31, 2026 · Feb 4, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -721,6 +721,10 @@
         title: Qwen2MoE
       - local: model_doc/qwen3
         title: Qwen3
+      - local: model_doc/qwen3_5
+        title: Qwen3.5
+      - local: model_doc/qwen3_5_moe
+        title: Qwen3.5 MoE
       - local: model_doc/qwen3_moe
         title: Qwen3MoE
       - local: model_doc/qwen3_next

diff --git a/docs/source/en/model_doc/qwen3_5.md b/docs/source/en/model_doc/qwen3_5.md
@@ -0,0 +1,76 @@
+<!--Copyright 2026 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-01-01 and added to Hugging Face Transformers on 2026-02-09.*
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">    </div>
+</div>
+
+# Qwen3.5
+
+[Qwen3.5](https://huggingface.co/papers/2502.13923) TODO @shuaibai @bozheng
+
+Model usage
+
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```py
+TODO
+```
+
+</hfoption>
+</hfoptions>
+
+## Qwen3_5Config
+
+[[autodoc]] Qwen3_5Config
+
+## Qwen3_5TextConfig
+
+[[autodoc]] Qwen3_5TextConfig
+
+## Qwen3_5VisionModel
+
+[[autodoc]] Qwen3_5VisionModel
+    - forward
+
+## Qwen3_5TextModel
+
+[[autodoc]] Qwen3_5TextModel
+    - forward
+
+## Qwen3_5Model
+
+[[autodoc]] Qwen3_5Model
+    - forward
+
+## Qwen3_5ForCausalLM
+
+[[autodoc]] Qwen3_5ForCausalLM
+    - forward
+
+## Qwen3_5ForConditionalGeneration
+
+[[autodoc]] Qwen3_5ForConditionalGeneration
+    - forward
+
+## Qwen3_5Tokenizer
+
+[[autodoc]] Qwen3_5Tokenizer
diff --git a/docs/source/en/model_doc/qwen3_5_moe.md b/docs/source/en/model_doc/qwen3_5_moe.md
@@ -0,0 +1,72 @@
+<!--Copyright 2026 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-01-01 and added to Hugging Face Transformers on 2026-02-09.*
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">    </div>
+</div>
+
+# Qwen3.5 MoE
+
+[Qwen3.5 MoE](https://huggingface.co/papers/2502.13923) TODO @shuaibai @bozheng
+
+Model usage
+
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```py
+TODO
+```
+
+</hfoption>
+</hfoptions>
+
+## Qwen3_5MoeConfig
+
+[[autodoc]] Qwen3_5MoeConfig
+
+## Qwen3_5MoeTextConfig
+
+[[autodoc]] Qwen3_5MoeTextConfig
+
+## Qwen3_5MoeVisionModel
+
+[[autodoc]] Qwen3_5MoeVisionModel
+    - forward
+
+## Qwen3_5MoeTextModel
+
+[[autodoc]] Qwen3_5MoeTextModel
+    - forward
+
+## Qwen3_5MoeModel
+
+[[autodoc]] Qwen3_5MoeModel
+    - forward
+
+## Qwen3_5MoeForCausalLM
+
+[[autodoc]] Qwen3_5MoeForCausalLM
+    - forward
+
+## Qwen3_5MoeForConditionalGeneration
+
+[[autodoc]] Qwen3_5MoeForConditionalGeneration
+    - forward
diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
@@ -59,6 +59,7 @@
     "qwen3_omni_moe": "qwen2_moe",
     "qwen3_omni_moe_thinker": "qwen2_moe",
     "qwen3_next": "qwen2_moe",
+    "qwen3_5_moe": "qwen2_moe",
     "hunyuan_v1_moe": "qwen2_moe",
     "flex_olmo": "qwen2_moe",
     "olmoe": "qwen2_moe",
@@ -70,6 +71,9 @@
 
 def _build_checkpoint_conversion_mapping():
     mapping = {
+        "qwen3_5_text": [
+            WeightRenaming(source_patterns=r"^model.language_model", target_patterns="model"),
+        ],
         "t5gemma2": [
             WeightRenaming(r"(?<!vision_model\.)encoder.embed_tokens.", "encoder.text_model.embed_tokens."),
             WeightRenaming(r"(?<!vision_model\.)encoder.norm.", "encoder.text_model.norm."),
@@ -351,6 +355,9 @@ def _build_checkpoint_conversion_mapping():
     mapping["exaone_moe"] = mapping["qwen2_moe"].copy()
     mapping["exaone_moe"] += [WeightRenaming("mlp.e_score_correction_bias", "mlp.gate.e_score_correction_bias")]
 
+    mapping["qwen3_5_moe_text"] = mapping["qwen3_5_text"].copy()
+    mapping["qwen3_5_moe_text"] += mapping["qwen2_moe"].copy()
+
     for model_type, base_pattern in _MODEL_TO_CONVERSION_PATTERN.items():
         if model_type in mapping:
             continue

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
@@ -369,7 +369,7 @@ def prepare_fa_kwargs_from_position_ids(position_ids):
     """
     tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}
 
-    position_ids = position_ids.view(-1)
+    position_ids = position_ids.reshape(-1)
     indices_q = (position_ids == 0).nonzero().view(-1)
 
     cu_seq_lens_q = torch.cat(

diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py
@@ -645,7 +645,10 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: set | None
         partial_rotary_factor = kwargs.get("partial_rotary_factor", getattr(self, "partial_rotary_factor", None))
         if partial_rotary_factor is not None:
             self.rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor)
-            ignore_keys_at_rope_validation = {"partial_rotary_factor"}
+            ignore_keys_at_rope_validation = (
+                set() if ignore_keys_at_rope_validation is None else ignore_keys_at_rope_validation
+            )
+            ignore_keys_at_rope_validation = ignore_keys_at_rope_validation | {"partial_rotary_factor"}
 
         self.standardize_rope_params()
         self.validate_rope(ignore_keys=ignore_keys_at_rope_validation)

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -321,6 +321,8 @@
     from .qwen2_moe import *
     from .qwen2_vl import *
     from .qwen3 import *
+    from .qwen3_5 import *
+    from .qwen3_5_moe import *
     from .qwen3_moe import *
     from .qwen3_next import *
     from .qwen3_omni_moe import *

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -361,6 +361,10 @@
         ("qwen2_vl", "Qwen2VLConfig"),
         ("qwen2_vl_text", "Qwen2VLTextConfig"),
         ("qwen3", "Qwen3Config"),
+        ("qwen3_5", "Qwen3_5Config"),
+        ("qwen3_5_moe", "Qwen3_5MoeConfig"),
+        ("qwen3_5_moe_text", "Qwen3_5MoeTextConfig"),
+        ("qwen3_5_text", "Qwen3_5TextConfig"),
         ("qwen3_moe", "Qwen3MoeConfig"),
         ("qwen3_next", "Qwen3NextConfig"),
         ("qwen3_omni_moe", "Qwen3OmniMoeConfig"),
@@ -850,6 +854,10 @@
         ("qwen2_vl", "Qwen2VL"),
         ("qwen2_vl_text", "Qwen2VL"),
         ("qwen3", "Qwen3"),
+        ("qwen3_5", "Qwen3_5"),
+        ("qwen3_5_moe", "Qwen3_5Moe"),
+        ("qwen3_5_moe_text", "Qwen3_5MoeText"),
+        ("qwen3_5_text", "Qwen3_5Text"),
         ("qwen3_moe", "Qwen3MoE"),
         ("qwen3_next", "Qwen3Next"),
         ("qwen3_omni_moe", "Qwen3OmniMoE"),
@@ -1042,6 +1050,8 @@
         ("qwen2_vl_text", "qwen2_vl"),
         ("qwen3_vl_text", "qwen3_vl"),
         ("qwen3_vl_moe_text", "qwen3_vl_moe"),
+        ("qwen3_5_text", "qwen3_5"),
+        ("qwen3_5_moe_text", "qwen3_5_moe"),
         ("sam_vision_model", "sam"),
         ("sam2_vision_model", "sam2"),
         ("sam2_hiera_det_model", "sam2"),

diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -177,6 +177,8 @@
             ("qwen2_5_omni", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
             ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
             ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
+            ("qwen3_5", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
+            ("qwen3_5_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
             ("qwen3_omni_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
             ("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
             ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -350,6 +350,10 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_vl", "Qwen2VLModel"),
         ("qwen2_vl_text", "Qwen2VLTextModel"),
         ("qwen3", "Qwen3Model"),
+        ("qwen3_5", "Qwen3_5Model"),
+        ("qwen3_5_moe", "Qwen3_5MoeModel"),
+        ("qwen3_5_moe_text", "Qwen3_5MoeTextModel"),
+        ("qwen3_5_text", "Qwen3_5TextModel"),
         ("qwen3_moe", "Qwen3MoeModel"),
         ("qwen3_next", "Qwen3NextModel"),
         ("qwen3_vl", "Qwen3VLModel"),
@@ -680,6 +684,10 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2", "Qwen2ForCausalLM"),
         ("qwen2_moe", "Qwen2MoeForCausalLM"),
         ("qwen3", "Qwen3ForCausalLM"),
+        ("qwen3_5", "Qwen3_5ForCausalLM"),  # VLM compatibility
+        ("qwen3_5_moe", "Qwen3_5MoeForCausalLM"),  # VLM compatibility
+        ("qwen3_5_moe_text", "Qwen3_5MoeForCausalLM"),
+        ("qwen3_5_text", "Qwen3_5ForCausalLM"),
         ("qwen3_moe", "Qwen3MoeForCausalLM"),
         ("qwen3_next", "Qwen3NextForCausalLM"),
         ("recurrent_gemma", "RecurrentGemmaForCausalLM"),
@@ -957,6 +965,8 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("pixtral", "LlavaForConditionalGeneration"),
         ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),
         ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
+        ("qwen3_5", "Qwen3_5ForConditionalGeneration"),
+        ("qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"),
         ("qwen3_vl", "Qwen3VLForConditionalGeneration"),
         ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"),
         ("shieldgemma2", "Gemma3ForConditionalGeneration"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -132,6 +132,8 @@
         ("qwen2_5_vl", "Qwen2_5_VLProcessor"),
         ("qwen2_audio", "Qwen2AudioProcessor"),
         ("qwen2_vl", "Qwen2VLProcessor"),
+        ("qwen3_5", "Qwen3VLProcessor"),
+        ("qwen3_5_moe", "Qwen3VLProcessor"),
         ("qwen3_omni_moe", "Qwen3OmniMoeProcessor"),
         ("qwen3_vl", "Qwen3VLProcessor"),
         ("qwen3_vl_moe", "Qwen3VLProcessor"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -258,6 +258,8 @@
         ("qwen2_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
         ("qwen2_vl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
         ("qwen3", "Qwen2Tokenizer" if is_tokenizers_available() else None),
+        ("qwen3_5", "Qwen3_5Tokenizer" if is_tokenizers_available() else None),
+        ("qwen3_5_moe", "Qwen3_5Tokenizer" if is_tokenizers_available() else None),
         ("qwen3_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
         ("qwen3_next", "Qwen2Tokenizer" if is_tokenizers_available() else None),
         ("qwen3_omni_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),

diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py
@@ -66,6 +66,8 @@
             ("qwen2_5_omni", "Qwen2VLVideoProcessor"),
             ("qwen2_5_vl", "Qwen2VLVideoProcessor"),
             ("qwen2_vl", "Qwen2VLVideoProcessor"),
+            ("qwen3_5", "Qwen3VLVideoProcessor"),
+            ("qwen3_5_moe", "Qwen3VLVideoProcessor"),
             ("qwen3_omni_moe", "Qwen2VLVideoProcessor"),
             ("qwen3_vl", "Qwen3VLVideoProcessor"),
             ("qwen3_vl_moe", "Qwen3VLVideoProcessor"),

diff --git a/src/transformers/models/qwen3_5/__init__.py b/src/transformers/models/qwen3_5/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_qwen3_5 import *
+    from .modeling_qwen3_5 import *
+    from .tokenization_qwen3_5 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)