Skip to content

Eval bug: server/MTMD ignores jpeg EXIF orientation metadata #20870

@stduhpf

Description

@stduhpf

Name and Version

build: 8468 (3306dba) with MSVC 19.44.35209.0 for x64

Operating systems

Windows

GGML backends

Vulkan

Hardware

Ryzen 9 5900X, RX6800+RX5700XT

Models

Reproduced with Qwen3.5 35B, Qwen3.5 27B, and Gemma 3 12B QAT; it most likely happens with every vision model.

Problem description & steps to reproduce

When asking a vision model about an image whose orientation is determined by EXIF Orientation metadata (typically a picture taken with a smartphone), the metadata is ignored and the model sees the raw pixels without the orientation correction applied.

Vibe-coded python code to create test images
#!/usr/bin/env python3
"""
Generate EXIF Orientation test images with manipulated pixel data.

All output images should look identical (upright) when EXIF is supported,
but will appear rotated/flipped when EXIF is ignored.

Usage: python generate_orientation_tests.py <input_image> <output_directory> --manipulate
"""

import os
import sys
import piexif
from PIL import Image

def ensure_dir(path):
    """Create directory *path*, including missing parents; an existing directory is not an error."""
    os.makedirs(name=path, exist_ok=True)

def transform_pixels_for_orientation(img, orientation):
    """
    Pre-distort the pixels so that an EXIF-aware viewer displays them upright.

    A viewer that honours the Orientation tag applies a fixed transform for
    each tag value before display. This function applies the INVERSE of that
    transform, so "stored pixels + tag" renders like the input image, while a
    viewer that ignores the tag shows the distorted pixels.

    Args:
        img: Source image (PIL image); it is not modified.
        orientation: EXIF Orientation value the output will be tagged with.

    Returns:
        A new image. Values outside 2-8 (including 1 and the invalid 9)
        yield an unmodified copy.
    """
    def mirrored(source):
        # Horizontal mirror, shared by orientations 2, 5 and 7.
        return source.transpose(Image.Transpose.FLIP_LEFT_RIGHT)

    # Entries are lambdas so only the selected operation is evaluated.
    # Note: PIL's rotate() angle is counter-clockwise.
    inverse_ops = {
        # 2: viewer mirrors horizontally; a mirror is its own inverse.
        2: lambda: mirrored(img),
        # 3: viewer rotates 180 degrees; also self-inverse.
        3: lambda: img.rotate(180, expand=True),
        # 4: viewer mirrors vertically; self-inverse.
        4: lambda: img.transpose(Image.Transpose.FLIP_TOP_BOTTOM),
        # 5: viewer transposes (flip across the main diagonal);
        #    mirror + 90 degrees CCW is that same transpose.
        5: lambda: mirrored(img).rotate(90, expand=True),
        # 6: viewer rotates 90 degrees clockwise; undo with 90 CCW.
        6: lambda: img.rotate(90, expand=True),
        # 7: viewer flips across the anti-diagonal (transverse);
        #    mirror + 270 degrees CCW is that same flip.
        7: lambda: mirrored(img).rotate(270, expand=True),
        # 8: viewer rotates 90 degrees counter-clockwise; undo with 270 CCW.
        8: lambda: img.rotate(270, expand=True),
    }

    operation = inverse_ops.get(orientation)
    if operation is None:
        # Orientation 1 needs no change; anything else (e.g. 9) is treated as 1.
        return img.copy()
    return operation()

def set_orientation_tag(image_path, output_dir, target_orientation, manipulate_pixels=False):
    """
    Write a JPEG copy of an image carrying a specific EXIF Orientation tag.

    The source is loaded, converted to a JPEG-compatible mode if needed,
    optionally pre-transformed with transform_pixels_for_orientation(), and
    saved into output_dir with a fresh EXIF block that contains only the
    Orientation tag.

    Args:
        image_path: Path of the source image.
        output_dir: Existing directory that receives the output file.
        target_orientation: Value written to the EXIF Orientation tag.
        manipulate_pixels: When True, also transform the pixel data so the
            tagged image displays upright in EXIF-aware viewers.

    Output name: "<stem>_Ori<N>_Manp.jpg" when pixels are manipulated,
    otherwise "<stem>_Ori<N>.jpg".

    Errors raised while opening or converting the image propagate to the
    caller; errors raised while saving are reported on stdout and swallowed.
    """
    # Open via a context manager so the file handle is released even when a
    # later step raises; Image.open() is lazy and keeps the file open.
    with Image.open(image_path) as source:
        img = source

        # JPEG cannot store alpha, palette or high-bit-depth data. Convert
        # every mode except plain RGB / greyscale, so inputs such as PA,
        # I;16 or F no longer fail at save time.
        if img.mode not in ('RGB', 'L'):
            img = img.convert('RGB')

        # Optionally manipulate pixel data
        if manipulate_pixels:
            img = transform_pixels_for_orientation(img, target_orientation)

        # Build a minimal EXIF structure holding only the Orientation tag
        exif_dict = {"0th": {}, "Exif": {}, "GPS": {}, "1st": {}, "thumbnail": None}
        exif_dict["0th"][piexif.ImageIFD.Orientation] = target_orientation
        exif_bytes = piexif.dump(exif_dict)

        # Generate output filename from the source stem (extension is dropped)
        name = os.path.splitext(os.path.basename(image_path))[0]
        if manipulate_pixels:
            output_filename = os.path.join(output_dir, f"{name}_Ori{target_orientation}_Manp.jpg")
        else:
            output_filename = os.path.join(output_dir, f"{name}_Ori{target_orientation}.jpg")

        # Save as JPEG with EXIF; must happen while the source file is still
        # open because an unconverted image is loaded lazily.
        try:
            img.save(output_filename, 'JPEG', exif=exif_bytes, quality=95)
            mode = "pixels manipulated" if manipulate_pixels else "tag only"
            print(f"Generated: {os.path.basename(output_filename)} (Tag: {target_orientation}) [{mode}]")
        except Exception as e:
            # Best effort: report and continue so the remaining orientations
            # can still be generated by the caller.
            print(f"Error saving {output_filename}: {e}")

def main():
    """
    Command-line entry point: generate one test JPEG per EXIF orientation 1-9.

    Positional arguments are the input image and the output directory
    (created if missing). With --manipulate the pixel data is pre-transformed
    to match each tag; otherwise only the tag is changed. --tags-only is
    accepted for explicitness and is the default; if both flags are given,
    --manipulate wins.

    Exits with status 1 when the input is not an existing file, or when
    processing raised for at least one orientation. Save errors are handled
    inside set_orientation_tag() and are not visible here.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Generate EXIF orientation test images')
    parser.add_argument('input_image', help='Input image path')
    parser.add_argument('output_dir', help='Output directory')
    parser.add_argument('--manipulate', action='store_true',
                        help='Manipulate pixel data according to orientation tag')
    parser.add_argument('--tags-only', action='store_true',
                        help='Only change EXIF tag, do not manipulate pixels (default)')

    args = parser.parse_args()

    # isfile() rather than exists(): a directory would pass exists() and then
    # fail once per orientation inside the loop below.
    if not os.path.isfile(args.input_image):
        print(f"Error: Input file '{args.input_image}' not found or is not a regular file.",
              file=sys.stderr)
        sys.exit(1)

    ensure_dir(args.output_dir)

    print(f"Processing: {args.input_image}")
    print(f"Output Directory: {args.output_dir}\n")

    # --tags-only and "no flag" are the same mode; --manipulate takes precedence.
    mode = "MANIPULATE_PIXELS" if args.manipulate else "TAGS_ONLY"

    print(f"Mode: {mode}\n")
    print("Generating EXIF Orientation Tags (1-9)...\n")

    # 9 is not a valid EXIF orientation; it is generated on purpose to see
    # how consumers handle an out-of-range tag.
    failed_orientations = []
    for orient in range(1, 10):
        try:
            set_orientation_tag(
                args.input_image,
                args.output_dir,
                orient,
                manipulate_pixels=args.manipulate
            )
        except Exception as e:
            # Keep going so one bad orientation does not block the others.
            print(f"Failed to process Orientation {orient}: {e}")
            failed_orientations.append(orient)

    print("\n" + "="*60)
    print("TESTING GUIDE:")
    print("="*60)
    if args.manipulate:
        print("✅ PIXELS MANIPULATED:")
        print("   - All images should look UPRIGHT (same visual)")
        print("   - EXIF tag tells viewer how to display")
        print("   - Test: Open in different apps - some may show rotated/flipped!")
    else:
        print("⚠️  TAGS ONLY (original pixels):")
        print("   - Tag indicates rotation, pixels stay upright")
        print("   - EXIF-supported apps will rotate them")
        print("   - Non-EXIF apps will show them upright (ignoring tag)")
    print("="*60)

    # Signal partial failure to calling scripts instead of always exiting 0.
    if failed_orientations:
        sys.exit(1)

# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

To reproduce, generate test images with the script above (using --manipulate) and ask the model about them. It quickly becomes apparent that the model does not see the images in the orientation in which most image viewers display them.

First Bad Commit

No response

Relevant log output

Logs
> /image .\orientation_results\lenna_Ori3_Manp.jpg
.\orientation_results\lenna_Ori3_Manp.jpg image loaded

> /image .\orientation_results\lenna_Ori1_Manp.jpg
.\orientation_results\lenna_Ori1_Manp.jpg image loaded

> Is there a difference between these two images?
encoding image slice...
image slice encoded in 4540 ms
decoding image batch 1/1, n_tokens_batch = 256
sched_reserve: reserving ...
sched_reserve:    Vulkan0 compute buffer size =   748.57 MiB
sched_reserve:    Vulkan1 compute buffer size =   811.58 MiB
sched_reserve: Vulkan_Host compute buffer size =   539.09 MiB
sched_reserve: graph nodes  = 1929
sched_reserve: graph splits = 3
sched_reserve: reserve took 726.60 ms, sched copies = 4
image decoded (batch 1/1) in 1731 ms
sched_reserve: reserving ...
sched_reserve:    Vulkan0 compute buffer size =   748.57 MiB
sched_reserve:    Vulkan1 compute buffer size =   811.58 MiB
sched_reserve: Vulkan_Host compute buffer size =   539.09 MiB
sched_reserve: graph nodes  = 1929
sched_reserve: graph splits = 3
sched_reserve: reserve took 700.99 ms, sched copies = 4
encoding image slice...
image slice encoded in 4111 ms
decoding image batch 1/1, n_tokens_batch = 256
sched_reserve: reserving ...
sched_reserve:    Vulkan0 compute buffer size =   748.57 MiB
sched_reserve:    Vulkan1 compute buffer size =   811.58 MiB
sched_reserve: Vulkan_Host compute buffer size =   539.09 MiB
sched_reserve: graph nodes  = 1929
sched_reserve: graph splits = 3
sched_reserve: reserve took 652.66 ms, sched copies = 4
image decoded (batch 1/1) in 1024 ms
sched_reserve: reserving ...
sched_reserve:    Vulkan0 compute buffer size =   748.57 MiB
sched_reserve:    Vulkan1 compute buffer size =   811.58 MiB
sched_reserve: Vulkan_Host compute buffer size =   539.09 MiB
sched_reserve: graph nodes  = 1929
sched_reserve: graph splits = 3
sched_reserve: reserve took 719.29 ms, sched copies = 4

Yes, there's a significant difference between the two images.

**The first image is upside down.** The second image is the right-side-up version of the same photograph.



It appears to be the same person wearing the same hat, but the orientation is completely reversed in the first image.

Metadata

Metadata

Type

No fields configured for Bug.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions