Fix GVN and SROA miscompilation of min precision vector element access

alsepkow · Copilot · alsepkow · commit b34136b9a046 · 2026-03-13T23:44:21.000-07:00
Multiple optimization passes mishandle min precision vector types due to DXC's padded data layout (i16:32, f16:32), where getTypeSizeInBits returns padded sizes for vectors but primitive sizes for scalars. Bug 1 - GVN ICE: CanCoerceMustAliasedValueToLoad computes a padded integer type (e.g., i96 for <3 x half>) then attempts to bitcast from the 48-bit LLVM type, triggering an assert. Fix: reject coercion when type sizes include padding. Bug 2 - GVN incorrect store forwarding: processLoad forwards a 'store <3 x i16> zeroinitializer' past partial element stores because MemoryDependence uses padded sizes for aliasing. Fix: skip store-to-load forwarding for padded types. Bug 3 - SROA element misindexing: getNaturalGEPRecursively uses getTypeSizeInBits (2 bytes for i16) for element offsets while GEP uses getTypeAllocSize (4 bytes with i16:32). Byte offset 4 (element 1) maps to index 4/2=2 instead of 4/4=1, causing SROA to misplace or eliminate element stores. Fix: use getTypeAllocSizeInBits consistently for vector element sizes throughout SROA. Fixes microsoft#8268 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
@@ -853,6 +853,20 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
       StoredVal->getType()->isArrayTy())
     return false;
 
+  // HLSL Change Begin - Don't coerce types that have padding in the data
+  // layout (e.g., min precision types where f16:32 means half is stored in 32
+  // bits). The coercion creates bitcasts between the LLVM type (based on
+  // primitive bit width) and an integer type (based on padded store size),
+  // which trips an assert in the bitcast when the two sizes differ.
+  Type *StoredValTy = StoredVal->getType();
+  uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits();
+  uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits();
+  if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits)
+    return false;
+  if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits)
+    return false;
+  // HLSL Change End
+
   // The store has to be at least as big as the load.
   if (DL.getTypeSizeInBits(StoredVal->getType()) <
         DL.getTypeSizeInBits(LoadTy))
@@ -1942,6 +1956,17 @@ bool GVN::processLoad(LoadInst *L) {
   if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
     Value *StoredVal = DepSI->getValueOperand();
 
+    // HLSL Change Begin - Don't forward stores of types with data layout
+    // padding (e.g., min precision vectors where i16:32/f16:32 means elements
+    // are padded to 32 bits). MemoryDependence may incorrectly classify
+    // intermediate partial stores as non-clobbering when sizes include padding,
+    // leading to incorrect store-to-load forwarding.
+    Type *StoredTy = StoredVal->getType();
+    uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits();
+    if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits)
+      return false;
+    // HLSL Change End
+
     // The store and load are to a must-aliased pointer, but they may not
     // actually have the same type.  See if we know how to reuse the stored
     // value (depending on its type).
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
@@ -1671,7 +1671,11 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
   // extremely poorly defined currently. The long-term goal is to remove GEPing
   // over a vector from the IR completely.
   if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
-    unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
+    // HLSL Change: Use alloc size instead of primitive type size for vector
+    // elements. DXC's data layout pads min precision types (i16:32, f16:32),
+    // so getTypeAllocSize matches the GEP offset stride while
+    // getTypeSizeInBits returns the unpadded primitive width.
+    unsigned ElementSizeInBits = DL.getTypeAllocSizeInBits(VecTy->getScalarType());
     if (ElementSizeInBits % 8 != 0) {
       // GEPs over non-multiple of 8 size vector elements are invalid.
       return nullptr;
@@ -2134,7 +2138,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,
 
   // Try each vector type, and return the one which works.
   auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
-    uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+    // HLSL Change: Use alloc size to match GEP offset stride for padded types.
+    uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType());
 
     // While the definition of LLVM vectors is bitpacked, we don't support sizes
     // that aren't byte sized.
@@ -2492,12 +2497,13 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
                   : nullptr),
         VecTy(PromotableVecTy),
         ElementTy(VecTy ? VecTy->getElementType() : nullptr),
-        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
+        // HLSL Change: Use alloc size to match GEP stride for padded types.
+        ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0),
         BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
         OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
         IRB(NewAI.getContext(), ConstantFolder()) {
     if (VecTy) {
-      assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
+      assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 &&
              "Only multiple-of-8 sized vector elements are viable");
       ++NumVectorized;
     }