@@ -1743,7 +1743,8 @@ struct clip_model_loader {
17431743
17441744 if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
17451745 || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
1746- || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL) {
1746+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
1747+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
17471748 n_layer += 1 ;
17481749 }
17491750
@@ -2856,7 +2857,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
28562857 }
28572858 return true ;
28582859 }
2859- else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
2860+ else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL ) {
28602861 clip_image_u8 resized;
28612862 auto patch_size = clip_get_patch_size (ctx) * 2 ;
28622863 int nx = ceil ((float )img->nx / patch_size) * patch_size;
@@ -3255,7 +3256,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
32553256 else {
32563257 // non-minicpmv models
32573258
3258- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
3259+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL ) {
32593260 // pw * ph = number of tokens output by ViT after apply patch merger
32603261 // ipw * ipw = number of vision token been processed inside ViT
32613262 const int merge_ratio = 2 ;
@@ -3395,7 +3396,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33953396 }
33963397 }
33973398
3398- if (use_window_attn && ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3399+ if (use_window_attn && ( ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL) ) {
33993400 struct ggml_tensor * window_idx = ggml_graph_get_tensor (gf, " window_idx" );
34003401 struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor (gf, " inv_window_idx" );
34013402 struct ggml_tensor * window_mask = ggml_graph_get_tensor (gf, " window_mask" );
0 commit comments