It looks like LoopVectorizePass introduces changes that uncover a bug in the backend. The transformation was verified with alive2. I've attached C++ and LLVM IR reproducers. C++ reproducer: // func.cpp extern int var_3; extern bool var_23; extern int arr_12[]; extern short arr_13[]; void test() { #pragma clang loop vectorize_predicate(enable) for (char a = 4; a < var_3; a++) { arr_13[a] = arr_12[a - 3]; var_23 = arr_12[a - 1]; } } // driver.cpp #include <stdio.h> int var_3 = 24; bool var_23 = 1; int arr_12 [25]; unsigned short arr_13 [25]; void test(); int main() { for (size_t i_0 = 0; i_0 < 25; ++i_0) arr_12 [i_0] = 1; test(); printf("%d\n", (int)var_23); } >$ clang++ -O0 -march=skx func.cpp driver.cpp && sde -skx -- ./a.out 1 >$ clang++ -O1 -march=skx func.cpp driver.cpp && sde -skx -- ./a.out 0 LLVM IR Reproducer: ; ModuleID = 'func.cpp' source_filename = "func.cpp" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @var_3 = external dso_local local_unnamed_addr global i32, align 4 @arr_12 = external dso_local local_unnamed_addr global [0 x i32], align 4 @arr_13 = external dso_local local_unnamed_addr global [0 x i16], align 2 @var_23 = external dso_local local_unnamed_addr global i8, align 1 ; Function Attrs: mustprogress nofree norecurse nosync nounwind uwtable define dso_local void @_Z4testv() local_unnamed_addr #0 { entry: %0 = load i32, i32* @var_3, align 4, !tbaa !3 %cmp13 = icmp sgt i32 %0, 4 br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup for.body.preheader: ; preds = %entry br label %for.body for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body %conv15.lcssa = phi i32 [ %conv15, %for.body ] %sub6 = add nsw i32 %conv15.lcssa, -1 %idxprom7 = sext i32 %sub6 to i64 %arrayidx8 = getelementptr inbounds [0 x i32], [0 x i32]* @arr_12, i64 0, i64 %idxprom7 %1 = load i32, i32* %arrayidx8, align 4, !tbaa !3 %tobool = icmp ne i32 %1, 0 %frombool = zext i1 
%tobool to i8 store i8 %frombool, i8* @var_23, align 1, !tbaa !7 br label %for.cond.cleanup for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry ret void for.body: ; preds = %for.body.preheader, %for.body %conv15 = phi i32 [ %conv, %for.body ], [ 4, %for.body.preheader ] %a.014 = phi i8 [ %inc, %for.body ], [ 4, %for.body.preheader ] %sub = add nsw i32 %conv15, -3 %idxprom = sext i32 %sub to i64 %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @arr_12, i64 0, i64 %idxprom %2 = load i32, i32* %arrayidx, align 4, !tbaa !3 %conv2 = trunc i32 %2 to i16 %idxprom3 = sext i8 %a.014 to i64 %arrayidx4 = getelementptr inbounds [0 x i16], [0 x i16]* @arr_13, i64 0, i64 %idxprom3 store i16 %conv2, i16* %arrayidx4, align 2, !tbaa !9 %inc = add i8 %a.014, 1 %conv = sext i8 %inc to i32 %cmp = icmp sgt i32 %0, %conv br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge, !llvm.loop !11 } attributes #0 = { mustprogress nofree norecurse nosync nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="skx" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 7, !"uwtable", i32 1} !2 = !{!"clang version 14.0.0 (https://github.com/llvm/llvm-project.git 2d77b272a8f9b5b89b022628ca30b6b896a8f725)"} !3 = !{!4, !4, i64 0} !4 = !{!"int", !5, i64 0} !5 = !{!"omnipotent char", !6, i64 0} !6 = !{!"Simple C++ TBAA"} !7 = !{!8, !8, i64 0} !8 = !{!"bool", !5, i64 0} !9 = !{!10, !10, i64 0} !10 = !{!"short", !5, i64 0} !11 = distinct !{!11, !12, !13, !14, !15} !12 = !{!"llvm.loop.mustprogress"} !13 = 
!{!"llvm.loop.unroll.disable"} !14 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !15 = !{!"llvm.loop.vectorize.enable", i1 true} >$ clang++ -O0 ok.ll driver.cpp && sde -skx -- ./a.out 1 >$ opt -loop-vectorize ok.ll > opt.ll && clang++ -O0 opt.ll driver.cpp && sde -skx -- ./a.out 0 LLVM version: clang version 14.0.0 (https://github.com/llvm/llvm-project.git 2d77b272a8f9b5b89b022628ca30b6b896a8f725)
Stepping through the binaries in gdb, the bad one uses an offset into arr_12 that is 4 larger than the good one in the last assignment to var_23. Reading the assembly, I found this dubious code snippet: mov -0x2c(%rsp),%eax sub $0x1,%eax add $0x4,%eax ; <-- Here the add seems superfluous ... mov %eax,-0x60(%rsp) ... mov -0x60(%rsp),%eax add $0xffffffff,%eax cltq cmpl $0x0,0x601040(,%rax,4) ; <-- rax here actually is "a - 1" The MIR after ISel is: bb.5.middle.block: ; predecessors: %bb.4 successors: %bb.7, %bb.6 %84:gr32 = SUB32ri8 %2:gr32(tied-def 0), 1, implicit-def $eflags %83:gr32 = ADD32ri8 %84:gr32(tied-def 0), 4, implicit-def $eflags %81:gr8 = MOV8ri 1 TEST8ri %81:gr8, 1, implicit-def $eflags JCC_1 %bb.7, 5, implicit $eflags However, I couldn't find the corresponding basic block in the ISel log. The only block that contains an ADD32ri8 is the one below, and I couldn't find any clue as to how the two are related. ===== Instruction selection ends: Selected selection DAG: %bb.4 '_Z4testv:vector.body' SelectionDAG has 29 nodes: t0: ch = EntryToken t3: i32,ch = CopyFromReg t0, Register:i32 %6 t67: i32 = Register $noreg t4: i32,i32 = ADD32ri8 t3, TargetConstant:i32<4> t19: ch = CopyToReg t0, Register:i32 %74, t4 t20: i8 = EXTRACT_SUBREG t3, TargetConstant:i32<1> t22: i8,i32 = ADD8ri t20, TargetConstant:i8<4> t36: ch = CopyToReg t0, Register:i8 %60, t22 t59: v8i32 = VPBROADCASTDrZ256rr t3 t56: v8i32,ch = VMOVDQA64Z256rm<Mem:(load (s256) from constant-pool)> Register:i64 $rip, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstantPool:i32<<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>> 0, Register:i16 $noreg, t0 t39: v8i32 = VPADDDZ256rr t59, t56 t41: v8i32,ch = CopyFromReg t0, Register:v8i32 %5 t43: v8i1 = VPCMPUDZ256rri t39, t41, TargetConstant:i8<2> t45: ch = CopyToReg t0, Register:v8i1 %52, t43 t46: ch = TokenFactor t19, t36, t45
I found the dubious "sub 1" and "add 4" in opt.ll itself. The backend doesn't do anything special with them: middle.block: ; preds = %vector.body %cast.cmo = sub i32 %n.vec, 1 %ind.escape = add i32 4, %cast.cmo br i1 true, label %for.cond.for.cond.cleanup_crit_edge, label %scalar.ph Also, this is *not* specific to skx: changing the attributes in opt.ll to other targets reproduces it too. @Vsevolod, how did you verify it with alive2? Is it possible that the problem is in LoopVectorizePass itself?
> Also, this is *not* specific to skx: changing the attributes in opt.ll to > other targets reproduces it too. That is weird. The initial C++ error is reproducible for skx only. There might be multiple bugs in the same pass, and which one is triggered may change when we alter the optimization sequence. > @Vsevolod, how did you verify it with alive2? I used alive2 in translation validation mode on the files before and after LoopVectorizePass. >$ opt -loop-vectorize -S before.ll > after.ll >$ alive-tv before.ll after.ll ... Transformation seems to be correct! Summary: 1 correct transformations 0 incorrect transformations 0 failed-to-prove transformations 0 Alive2 errors > Is it possible that the problem is in LoopVectorizePass itself? LoopVectorizePass is certainly involved in this bug, but I don't know exactly how. It exposes a wrong result, but that might be caused by an error in the pass itself, in the backend, or in its interaction with other optimizations.