diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 4c9f8c2723493..fef5f5a84d937 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -6,15 +6,17 @@ // //===----------------------------------------------------------------------===// // -// This pass tries to make consecutive compares of values use same operands to -// allow CSE pass to remove duplicated instructions. For this it analyzes -// branches and adjusts comparisons with immediate values by converting: -// * GE -> GT -// * GT -> GE -// * LT -> LE -// * LE -> LT -// and adjusting immediate values appropriately. It basically corrects two -// immediate values towards each other to make them equal. +// +// This pass tries to make consecutive comparisons of values use the same +// operands to allow the CSE pass to remove duplicate instructions. It adjusts +// comparisons with immediate values by converting between inclusive and +// exclusive forms (GE <-> GT, LE <-> LT) and correcting immediate values to +// make them equal. +// +// The pass handles: +// * Cross-block: SUBS/ADDS followed by conditional branches +// * Intra-block: CSINC conditional instructions +// // // Consider the following example in C: // @@ -49,11 +51,16 @@ // b.le .LBB0_6 // ... // -// Currently only SUBS and ADDS followed by b.?? are supported. +// See optimizeCrossBlock() and optimizeIntraBlock() for implementation details. // // TODO: maybe handle TBNZ/TBZ the same way as CMP when used instead for "a < 0" -// TODO: handle other conditional instructions (e.g. CSET) -// TODO: allow second branching to be anything if it doesn't require adjusting +// TODO: For cross-block: +// - handle other conditional instructions (e.g. CSET) +// - allow second branching to be anything if it doesn't require adjusting +// TODO: For intra-block: +// - handle CINC and CSET (CSINC aliases) as their conditions are inverted +// compared to CSINC. +// - handle other non-CSINC conditional instructions // //===----------------------------------------------------------------------===// @@ -111,6 +118,9 @@ class AArch64ConditionOptimizer : public MachineFunctionPass { void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info); bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To, int ToImm); + bool isPureCmp(MachineInstr &CmpMI); + bool optimizeIntraBlock(MachineBasicBlock &MBB); + bool optimizeCrossBlock(MachineBasicBlock &HBB); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -323,125 +333,295 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, return false; } -bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { - LLVM_DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" - << "********** Function: " << MF.getName() << '\n'); - if (skipFunction(MF.getFunction())) +bool AArch64ConditionOptimizer::isPureCmp(MachineInstr &CmpMI) { + unsigned ShiftAmt = AArch64_AM::getShiftValue(CmpMI.getOperand(3).getImm()); + if (!CmpMI.getOperand(2).isImm()) { + LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << CmpMI << '\n'); return false; + } else if (CmpMI.getOperand(2).getImm() << ShiftAmt >= 0xfff) { + LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << CmpMI + << '\n'); + return false; + } else if (!MRI->use_nodbg_empty(CmpMI.getOperand(0).getReg())) { + LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << CmpMI << '\n'); + return false; + } - TII = MF.getSubtarget().getInstrInfo(); - DomTree = &getAnalysis().getDomTree(); - MRI = &MF.getRegInfo(); - - bool Changed = false; + return true; +} - // Visit blocks in dominator tree pre-order. The pre-order enables multiple - // cmp-conversions from the same head block. - // Note that updateDomTree() modifies the children of the DomTree node - // currently being visited. The df_iterator supports that; it doesn't look at - // child_begin() / child_end() until after a node has been visited. - for (MachineDomTreeNode *I : depth_first(DomTree)) { - MachineBasicBlock *HBB = I->getBlock(); +// This function transforms two CMP+CSINC pairs within the same basic block +// when both conditions are the same (GT/GT or LT/LT) and immediates differ +// by 1. +// +// Example transformation: +// cmp w8, #10 +// csinc w9, w0, w1, gt ; w9 = (w8 > 10) ? w0 : w1+1 +// cmp w8, #9 +// csinc w10, w0, w1, gt ; w10 = (w8 > 9) ? w0 : w1+1 +// +// Into: +// cmp w8, #10 +// csinc w9, w0, w1, gt ; w9 = (w8 > 10) ? w0 : w1+1 +// csinc w10, w0, w1, ge ; w10 = (w8 >= 10) ? w0 : w1+1 +// +// The second CMP is eliminated, enabling CSE to remove the redundant +// comparison. +bool AArch64ConditionOptimizer::optimizeIntraBlock(MachineBasicBlock &MBB) { + MachineInstr *FirstCmp = nullptr; + MachineInstr *FirstCSINC = nullptr; + MachineInstr *SecondCmp = nullptr; + MachineInstr *SecondCSINC = nullptr; + + // Find two CMP + CSINC pairs + for (MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + // cmp is an alias for subs with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + // cmn is an alias for adds with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: { + if (!FirstCmp) { + FirstCmp = &MI; + } else if (FirstCSINC && !SecondCmp) { + SecondCmp = &MI; + } + break; + } - SmallVector HeadCond; - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) { - continue; + case AArch64::CSINCWr: + case AArch64::CSINCXr: { + // Found a CSINC, ensure it comes after the corresponding comparison + if (FirstCmp && !FirstCSINC) { + FirstCSINC = &MI; + } else if (SecondCmp && !SecondCSINC) { + SecondCSINC = &MI; + } + break; + } } - // Equivalence check is to skip loops. - if (!TBB || TBB == HBB) { - continue; + if (SecondCSINC) + break; + } + + if (!SecondCmp || !SecondCSINC) { + LLVM_DEBUG(dbgs() << "Didn't find two CMP+CSINC pairs\n"); + return false; + } + + if (FirstCmp->getOperand(1).getReg() != SecondCmp->getOperand(1).getReg()) { + LLVM_DEBUG(dbgs() << "CMPs compare different registers\n"); + return false; + } + + if (!isPureCmp(*FirstCmp) || !isPureCmp(*SecondCmp)) { + LLVM_DEBUG(dbgs() << "One or both CMPs are not pure\n"); + return false; + } + + // Check that nothing else modifies the flags between the first CMP and second + // conditional + for (auto It = std::next(MachineBasicBlock::iterator(FirstCmp)); + It != std::next(MachineBasicBlock::iterator(SecondCSINC)); ++It) { + if (&*It != SecondCmp && + It->modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) { + LLVM_DEBUG(dbgs() << "Flags modified between CMPs by: " << *It << '\n'); + return false; } + } - SmallVector TrueCond; - MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr; - if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { - continue; + // Check flags aren't read after second conditional within the same block + for (auto It = std::next(MachineBasicBlock::iterator(SecondCSINC)); + It != MBB.end(); ++It) { + if (It->readsRegister(AArch64::NZCV, /*TRI=*/nullptr)) { + LLVM_DEBUG(dbgs() << "Flags read after second CSINC by: " << *It << '\n'); + return false; } + } - MachineInstr *HeadCmpMI = findSuitableCompare(HBB); - if (!HeadCmpMI) { - continue; + // Since we may modify a cmp in this MBB, make sure NZCV does not live out. + for (auto *SuccBB : MBB.successors()) + if (SuccBB->isLiveIn(AArch64::NZCV)) + return false; + + // Extract condition codes from both CSINCs (operand 3) + AArch64CC::CondCode FirstCond = + (AArch64CC::CondCode)(int)FirstCSINC->getOperand(3).getImm(); + AArch64CC::CondCode SecondCond = + (AArch64CC::CondCode)(int)SecondCSINC->getOperand(3).getImm(); + + const int FirstImm = (int)FirstCmp->getOperand(2).getImm(); + const int SecondImm = (int)SecondCmp->getOperand(2).getImm(); + + LLVM_DEBUG(dbgs() << "Comparing intra-block CSINCs: " + << AArch64CC::getCondCodeName(FirstCond) << " #" << FirstImm + << " and " << AArch64CC::getCondCodeName(SecondCond) << " #" + << SecondImm << '\n'); + + // Check if both conditions are the same and immediates differ by 1 + if (((FirstCond == AArch64CC::GT && SecondCond == AArch64CC::GT) || + (FirstCond == AArch64CC::LT && SecondCond == AArch64CC::LT)) && + std::abs(SecondImm - FirstImm) == 1) { + // Pick which comparison to adjust to match the other + // For GT: adjust the one with smaller immediate + // For LT: adjust the one with larger immediate + bool adjustFirst = (FirstImm < SecondImm); + if (FirstCond == AArch64CC::LT) { + adjustFirst = !adjustFirst; } - MachineInstr *TrueCmpMI = findSuitableCompare(TBB); - if (!TrueCmpMI) { - continue; + MachineInstr *CmpToAdjust = adjustFirst ? FirstCmp : SecondCmp; + MachineInstr *CSINCToAdjust = adjustFirst ? FirstCSINC : SecondCSINC; + AArch64CC::CondCode CondToAdjust = adjustFirst ? FirstCond : SecondCond; + int TargetImm = adjustFirst ? SecondImm : FirstImm; + + CmpInfo AdjustedInfo = adjustCmp(CmpToAdjust, CondToAdjust); + + if (std::get<0>(AdjustedInfo) == TargetImm && + std::get<1>(AdjustedInfo) == + (adjustFirst ? SecondCmp : FirstCmp)->getOpcode()) { + LLVM_DEBUG(dbgs() << "Successfully optimizing intra-block CSINC pair\n"); + + // Modify the selected CMP and CSINC + CmpToAdjust->getOperand(2).setImm(std::get<0>(AdjustedInfo)); + CmpToAdjust->setDesc(TII->get(std::get<1>(AdjustedInfo))); + CSINCToAdjust->getOperand(3).setImm(std::get<2>(AdjustedInfo)); + + return true; } + } - AArch64CC::CondCode HeadCmp; - if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) { - continue; + return false; +} + +// Optimize across blocks +bool AArch64ConditionOptimizer::optimizeCrossBlock(MachineBasicBlock &HBB) { + SmallVector HeadCond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->analyzeBranch(HBB, TBB, FBB, HeadCond)) { + return false; + } + + // Equivalence check is to skip loops. + if (!TBB || TBB == &HBB) { + return false; + } + + SmallVector TrueCond; + MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr; + if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { + return false; + } + + MachineInstr *HeadCmpMI = findSuitableCompare(&HBB); + if (!HeadCmpMI) { + return false; + } + + MachineInstr *TrueCmpMI = findSuitableCompare(TBB); + if (!TrueCmpMI) { + return false; + } + + AArch64CC::CondCode HeadCmp; + if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) { + return false; + } + + AArch64CC::CondCode TrueCmp; + if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) { + return false; + } + + const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm(); + const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm(); + + LLVM_DEBUG(dbgs() << "Head branch:\n"); + LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(HeadCmp) + << '\n'); + LLVM_DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n'); + + LLVM_DEBUG(dbgs() << "True branch:\n"); + LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(TrueCmp) + << '\n'); + LLVM_DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n'); + + if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) || + (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) && + std::abs(TrueImm - HeadImm) == 2) { + // This branch transforms machine instructions that correspond to + // + // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...) + // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...) + // + // into + // + // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...) + // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...) + + CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp); + CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp); + if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) && + std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) { + modifyCmp(HeadCmpMI, HeadCmpInfo); + modifyCmp(TrueCmpMI, TrueCmpInfo); + return true; + } + } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) || + (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) && + std::abs(TrueImm - HeadImm) == 1) { + // This branch transforms machine instructions that correspond to + // + // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...) + // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...) + // + // into + // + // 1) (a <= {NewImm} && ...) || (a > {NewImm} && ...) + // 2) (a < {NewImm} && ...) || (a >= {NewImm} && ...) + + // GT -> GE transformation increases immediate value, so picking the + // smaller one; LT -> LE decreases immediate value so invert the choice. + bool adjustHeadCond = (HeadImm < TrueImm); + if (HeadCmp == AArch64CC::LT) { + adjustHeadCond = !adjustHeadCond; } - AArch64CC::CondCode TrueCmp; - if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) { - continue; + if (adjustHeadCond) { + return adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm); + } else { + return adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm); } + } + // Other transformation cases almost never occur due to generation of < or > + // comparisons instead of <= and >=. - const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm(); - const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm(); + return false; +} - LLVM_DEBUG(dbgs() << "Head branch:\n"); - LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(HeadCmp) - << '\n'); - LLVM_DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n'); +bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" + << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(MF.getFunction())) + return false; - LLVM_DEBUG(dbgs() << "True branch:\n"); - LLVM_DEBUG(dbgs() << "\tcondition: " << AArch64CC::getCondCodeName(TrueCmp) - << '\n'); - LLVM_DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n'); - - if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) || - (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) && - std::abs(TrueImm - HeadImm) == 2) { - // This branch transforms machine instructions that correspond to - // - // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...) - // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...) - // - // into - // - // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...) - // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...) - - CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp); - CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp); - if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) && - std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) { - modifyCmp(HeadCmpMI, HeadCmpInfo); - modifyCmp(TrueCmpMI, TrueCmpInfo); - Changed = true; - } - } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) || - (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) && - std::abs(TrueImm - HeadImm) == 1) { - // This branch transforms machine instructions that correspond to - // - // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...) - // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...) - // - // into - // - // 1) (a <= {NewImm} && ...) || (a > {NewImm} && ...) - // 2) (a < {NewImm} && ...) || (a >= {NewImm} && ...) - - // GT -> GE transformation increases immediate value, so picking the - // smaller one; LT -> LE decreases immediate value so invert the choice. - bool adjustHeadCond = (HeadImm < TrueImm); - if (HeadCmp == AArch64CC::LT) { - adjustHeadCond = !adjustHeadCond; - } + TII = MF.getSubtarget().getInstrInfo(); + DomTree = &getAnalysis().getDomTree(); + MRI = &MF.getRegInfo(); - if (adjustHeadCond) { - Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm); - } else { - Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm); - } - } - // Other transformation cases almost never occur due to generation of < or > - // comparisons instead of <= and >=. + bool Changed = false; + + // Visit blocks in dominator tree pre-order. The pre-order enables multiple + // cmp-conversions from the same head block. + // Note that updateDomTree() modifies the children of the DomTree node + // currently being visited. The df_iterator supports that; it doesn't look at + // child_begin() / child_end() until after a node has been visited. + for (MachineDomTreeNode *I : depth_first(DomTree)) { + MachineBasicBlock *HBB = I->getBlock(); + Changed |= optimizeIntraBlock(*HBB); + Changed |= optimizeCrossBlock(*HBB); } return Changed; diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll index 6449c3e11d667..4449c2b9193a4 100644 --- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -7,6 +7,110 @@ @c = external global i32 @d = external global i32 + +; Test intra-block CSINC optimization with (a > 10) and (a >= 10) +; Two CSINC instructions should share a single CMP after optimization +define void @intra_block_csinc(i32 %x, i32 %y, ptr %out1, ptr %out2) #0 { +; CHECK-LABEL: intra_block_csinc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, :got:a +; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: cmp w8, #10 +; CHECK-NEXT: csinc w8, w0, w1, gt +; CHECK-NEXT: csinc w9, w0, w1, ge +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: str w9, [x3] +; CHECK-NEXT: ret +entry: + %val = load i32, ptr @a, align 4 + + ; First: result1 = (a > 10) ? x : (y + 1) + %cond1 = icmp sgt i32 %val, 10 + %y_inc1 = add i32 %y, 1 + %result1 = select i1 %cond1, i32 %x, i32 %y_inc1 + store i32 %result1, ptr %out1 + + ; Second: result2 = (a >= 10) ? x : (y + 1) + ; Canonicalizes to (a > 9), then optimizes to reuse first CMP with adjusted condition + %cond2 = icmp sge i32 %val, 10 + %y_inc2 = add i32 %y, 1 + %result2 = select i1 %cond2, i32 %x, i32 %y_inc2 + store i32 %result2, ptr %out2 + + ret void +} + +; Negative test: different registers should not be optimized +define void @intra_block_csinc_different_regs(i32 %x, i32 %y, ptr %out1, ptr %out2) #0 { +; CHECK-LABEL: intra_block_csinc_different_regs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, :got:a +; CHECK-NEXT: adrp x9, :got:b +; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] +; CHECK-NEXT: ldr x9, [x9, :got_lo12:b] +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: cmp w8, #10 +; CHECK-NEXT: csinc w8, w0, w1, gt +; CHECK-NEXT: cmp w9, #9 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: csinc w8, w0, w1, gt +; CHECK-NEXT: str w8, [x3] +; CHECK-NEXT: ret +entry: + %val1 = load i32, ptr @a, align 4 + %val2 = load i32, ptr @b, align 4 + + ; First: result1 = (a > 10) ? x : (y + 1) + %cond1 = icmp sgt i32 %val1, 10 + %y_inc1 = add i32 %y, 1 + %result1 = select i1 %cond1, i32 %x, i32 %y_inc1 + store i32 %result1, ptr %out1 + + ; Second: result2 = (b > 9) ? x : (y + 1) - compares DIFFERENT register + ; Should NOT optimize - need both CMPs + %cond2 = icmp sgt i32 %val2, 9 + %y_inc2 = add i32 %y, 1 + %result2 = select i1 %cond2, i32 %x, i32 %y_inc2 + store i32 %result2, ptr %out2 + + ret void +} + +; Test intra-block CSINC optimization with (a < 5) and (a < 6) +; LT/LT pattern - symmetric to GT/GT case +define void @intra_block_csinc_lt(i32 %x, i32 %y, ptr %out1, ptr %out2) #0 { +; CHECK-LABEL: intra_block_csinc_lt: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, :got:a +; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: cmp w8, #5 +; CHECK-NEXT: csinc w8, w0, w1, lt +; CHECK-NEXT: csinc w9, w0, w1, le +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: str w9, [x3] +; CHECK-NEXT: ret +entry: + %val = load i32, ptr @a, align 4 + + ; First: result1 = (a < 5) ? x : (y + 1) + %cond1 = icmp slt i32 %val, 5 + %y_inc1 = add i32 %y, 1 + %result1 = select i1 %cond1, i32 %x, i32 %y_inc1 + store i32 %result1, ptr %out1 + + ; Second: result2 = (a < 6) ? x : (y + 1) + ; Optimizes to reuse first CMP (#5) with adjusted condition (le) + %cond2 = icmp slt i32 %val, 6 + %y_inc2 = add i32 %y, 1 + %result2 = select i1 %cond2, i32 %x, i32 %y_inc2 + store i32 %result2, ptr %out2 + + ret void +} + ; (a > 10 && b == c) || (a >= 10 && b == d) define i32 @combine_gt_ge_10() #0 { ; CHECK-LABEL: combine_gt_ge_10: @@ -17,30 +121,30 @@ define i32 @combine_gt_ge_10() #0 { ; CHECK-NEXT: cmp w8, #10 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] -; CHECK-NEXT: b.le .LBB0_3 +; CHECK-NEXT: b.le .LBB3_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x9, :got:c ; CHECK-NEXT: ldr x9, [x9, :got_lo12:c] ; CHECK-NEXT: ldr w10, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w10, w9 -; CHECK-NEXT: b.ne .LBB0_4 +; CHECK-NEXT: b.ne .LBB3_4 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_3: // %lor.lhs.false -; CHECK-NEXT: b.lt .LBB0_6 -; CHECK-NEXT: .LBB0_4: // %land.lhs.true3 +; CHECK-NEXT: .LBB3_3: // %lor.lhs.false +; CHECK-NEXT: b.lt .LBB3_6 +; CHECK-NEXT: .LBB3_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB0_6 +; CHECK-NEXT: b.ne .LBB3_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_6: // %if.end +; CHECK-NEXT: .LBB3_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -80,7 +184,7 @@ define i32 @combine_gt_lt_5() #0 { ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmp w8, #5 -; CHECK-NEXT: b.le .LBB1_3 +; CHECK-NEXT: b.le .LBB4_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:c @@ -89,12 +193,12 @@ define i32 @combine_gt_lt_5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB1_6 +; CHECK-NEXT: b.ne .LBB4_6 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_3: // %lor.lhs.false -; CHECK-NEXT: b.ge .LBB1_6 +; CHECK-NEXT: .LBB4_3: // %lor.lhs.false +; CHECK-NEXT: b.ge .LBB4_6 ; CHECK-NEXT: // %bb.4: // %land.lhs.true3 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d @@ -103,11 +207,11 @@ define i32 @combine_gt_lt_5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB1_6 +; CHECK-NEXT: b.ne .LBB4_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_6: // %if.end +; CHECK-NEXT: .LBB4_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -149,30 +253,30 @@ define i32 @combine_lt_ge_5() #0 { ; CHECK-NEXT: cmp w8, #5 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] -; CHECK-NEXT: b.ge .LBB2_3 +; CHECK-NEXT: b.ge .LBB5_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x9, :got:c ; CHECK-NEXT: ldr x9, [x9, :got_lo12:c] ; CHECK-NEXT: ldr w10, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w10, w9 -; CHECK-NEXT: b.ne .LBB2_4 +; CHECK-NEXT: b.ne .LBB5_4 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_3: // %lor.lhs.false -; CHECK-NEXT: b.gt .LBB2_6 -; CHECK-NEXT: .LBB2_4: // %land.lhs.true3 +; CHECK-NEXT: .LBB5_3: // %lor.lhs.false +; CHECK-NEXT: b.gt .LBB5_6 +; CHECK-NEXT: .LBB5_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB2_6 +; CHECK-NEXT: b.ne .LBB5_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_6: // %if.end +; CHECK-NEXT: .LBB5_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -212,7 +316,7 @@ define i32 @combine_lt_gt_5() #0 { ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmp w8, #5 -; CHECK-NEXT: b.ge .LBB3_3 +; CHECK-NEXT: b.ge .LBB6_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:c @@ -221,12 +325,12 @@ define i32 @combine_lt_gt_5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB3_6 +; CHECK-NEXT: b.ne .LBB6_6 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB3_3: // %lor.lhs.false -; CHECK-NEXT: b.le .LBB3_6 +; CHECK-NEXT: .LBB6_3: // %lor.lhs.false +; CHECK-NEXT: b.le .LBB6_6 ; CHECK-NEXT: // %bb.4: // %land.lhs.true3 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d @@ -235,11 +339,11 @@ define i32 @combine_lt_gt_5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB3_6 +; CHECK-NEXT: b.ne .LBB6_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB3_6: // %if.end +; CHECK-NEXT: .LBB6_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -279,7 +383,7 @@ define i32 @combine_gt_lt_n5() #0 { ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmn w8, #5 -; CHECK-NEXT: b.le .LBB4_3 +; CHECK-NEXT: b.le .LBB7_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:c @@ -288,12 +392,12 @@ define i32 @combine_gt_lt_n5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB4_6 +; CHECK-NEXT: b.ne .LBB7_6 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_3: // %lor.lhs.false -; CHECK-NEXT: b.ge .LBB4_6 +; CHECK-NEXT: .LBB7_3: // %lor.lhs.false +; CHECK-NEXT: b.ge .LBB7_6 ; CHECK-NEXT: // %bb.4: // %land.lhs.true3 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d @@ -302,11 +406,11 @@ define i32 @combine_gt_lt_n5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB4_6 +; CHECK-NEXT: b.ne .LBB7_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_6: // %if.end +; CHECK-NEXT: .LBB7_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -346,7 +450,7 @@ define i32 @combine_lt_gt_n5() #0 { ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmn w8, #5 -; CHECK-NEXT: b.ge .LBB5_3 +; CHECK-NEXT: b.ge .LBB8_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:c @@ -355,12 +459,12 @@ define i32 @combine_lt_gt_n5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB5_6 +; CHECK-NEXT: b.ne .LBB8_6 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_3: // %lor.lhs.false -; CHECK-NEXT: b.le .LBB5_6 +; CHECK-NEXT: .LBB8_3: // %lor.lhs.false +; CHECK-NEXT: b.le .LBB8_6 ; CHECK-NEXT: // %bb.4: // %land.lhs.true3 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d @@ -369,11 +473,11 @@ define i32 @combine_lt_gt_n5() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB5_6 +; CHECK-NEXT: b.ne .LBB8_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_6: // %if.end +; CHECK-NEXT: .LBB8_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -428,19 +532,19 @@ define void @combine_non_adjacent_cmp_br(ptr nocapture readonly %hdCall) #0 { ; CHECK-NEXT: mov w19, #24 // =0x18 ; CHECK-NEXT: adrp x22, glob ; CHECK-NEXT: add x21, x20, #2 -; CHECK-NEXT: .LBB6_1: // %land.rhs +; CHECK-NEXT: .LBB9_1: // %land.rhs ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr x8, [x19] ; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: b.lt .LBB6_3 +; CHECK-NEXT: b.lt .LBB9_3 ; CHECK-NEXT: // %bb.2: // %while.body -; CHECK-NEXT: // in Loop: Header=BB6_1 Depth=1 +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 ; CHECK-NEXT: ldr x0, [x22, :lo12:glob] ; CHECK-NEXT: bl Update ; CHECK-NEXT: sub x21, x21, #2 ; CHECK-NEXT: cmp x20, x21 -; CHECK-NEXT: b.lt .LBB6_1 -; CHECK-NEXT: .LBB6_3: // %while.end +; CHECK-NEXT: b.lt .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %while.end ; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload @@ -488,19 +592,19 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { ; CHECK-NEXT: ldr x19, [x19, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x19] ; CHECK-NEXT: cmn w8, #2 -; CHECK-NEXT: b.gt .LBB7_4 +; CHECK-NEXT: b.gt .LBB10_4 ; CHECK-NEXT: // %bb.1: // %while.body.preheader ; CHECK-NEXT: sub w20, w8, #1 -; CHECK-NEXT: .LBB7_2: // %while.body +; CHECK-NEXT: .LBB10_2: // %while.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: bl do_something ; CHECK-NEXT: adds w20, w20, #1 -; CHECK-NEXT: b.mi .LBB7_2 +; CHECK-NEXT: b.mi .LBB10_2 ; CHECK-NEXT: // %bb.3: // %while.cond.while.end_crit_edge ; CHECK-NEXT: ldr w8, [x19] -; CHECK-NEXT: .LBB7_4: // %while.end +; CHECK-NEXT: .LBB10_4: // %while.end ; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: b.gt .LBB7_7 +; CHECK-NEXT: b.gt .LBB10_7 ; CHECK-NEXT: // %bb.5: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d @@ -509,13 +613,13 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB7_7 +; CHECK-NEXT: b.ne .LBB10_7 ; CHECK-NEXT: // %bb.6: ; CHECK-NEXT: mov w0, #123 // =0x7b -; CHECK-NEXT: b .LBB7_8 -; CHECK-NEXT: .LBB7_7: // %if.end +; CHECK-NEXT: b .LBB10_8 +; CHECK-NEXT: .LBB10_7: // %if.end ; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: .LBB7_8: // %return +; CHECK-NEXT: .LBB10_8: // %return ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 @@ -573,20 +677,20 @@ define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 { ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: b.gt .LBB8_3 +; CHECK-NEXT: b.gt .LBB11_3 ; CHECK-NEXT: // %bb.1: // %while.body.preheader ; CHECK-NEXT: sub w19, w8, #1 -; CHECK-NEXT: .LBB8_2: // %while.body +; CHECK-NEXT: .LBB11_2: // %while.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: bl do_something ; CHECK-NEXT: adds w19, w19, #1 -; CHECK-NEXT: b.mi .LBB8_2 -; CHECK-NEXT: .LBB8_3: // %while.end +; CHECK-NEXT: b.mi .LBB11_2 +; CHECK-NEXT: .LBB11_3: // %while.end ; CHECK-NEXT: adrp x8, :got:c ; CHECK-NEXT: ldr x8, [x8, :got_lo12:c] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmn w8, #2 -; CHECK-NEXT: b.lt .LBB8_6 +; CHECK-NEXT: b.lt .LBB11_6 ; CHECK-NEXT: // %bb.4: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d @@ -595,7 +699,7 @@ define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB8_6 +; CHECK-NEXT: b.ne .LBB11_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #123 // =0x7b ; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload @@ -603,7 +707,7 @@ define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 { ; CHECK-NEXT: .cfi_restore w19 ; CHECK-NEXT: .cfi_restore w30 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB8_6: // %if.end +; CHECK-NEXT: .LBB11_6: // %if.end ; CHECK-NEXT: .cfi_restore_state ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload @@ -659,14 +763,14 @@ define i32 @fcmpri(i32 %argc, ptr nocapture readonly %argv) #0 { ; CHECK-LABEL: fcmpri: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w0, #2 -; CHECK-NEXT: b.lt .LBB9_3 +; CHECK-NEXT: b.lt .LBB12_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: ldr x8, [x1, #8] -; CHECK-NEXT: cbz x8, .LBB9_3 +; CHECK-NEXT: cbz x8, .LBB12_3 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #3 // =0x3 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB9_3: // %if.end +; CHECK-NEXT: .LBB12_3: // %if.end ; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill @@ -782,12 +886,12 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: csel x9, x0, xzr, gt ; CHECK-NEXT: str x9, [x1] -; CHECK-NEXT: b.le .LBB11_2 +; CHECK-NEXT: b.le .LBB14_2 ; CHECK-NEXT: // %bb.1: // %lor.lhs.false ; CHECK-NEXT: cmp w8, #2 -; CHECK-NEXT: b.ge .LBB11_4 -; CHECK-NEXT: b .LBB11_6 -; CHECK-NEXT: .LBB11_2: // %land.lhs.true +; CHECK-NEXT: b.ge .LBB14_4 +; CHECK-NEXT: b .LBB14_6 +; CHECK-NEXT: .LBB14_2: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:c ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] @@ -795,11 +899,11 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB11_4 +; CHECK-NEXT: b.ne .LBB14_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_4: // %land.lhs.true3 +; CHECK-NEXT: .LBB14_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] @@ -807,11 +911,11 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB11_6 +; CHECK-NEXT: b.ne .LBB14_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_6: // %if.end +; CHECK-NEXT: .LBB14_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: