diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 8d8a1e67a5956..4994ce413c2a1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/MachineCycleAnalysis.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" @@ -1422,11 +1423,7 @@ bool PreRARematStage::initGCNSchedStage() { dbgs() << ")\n"; }); - if (AchievedOcc > DAG.MinOccupancy) { - DAG.MinOccupancy = AchievedOcc; - SIMachineFunctionInfo &MFI = *MF.getInfo(); - MFI.increaseOccupancy(MF, DAG.MinOccupancy); - } + DAG.setTargetOccupancy(getStageTargetOccupancy()); return true; } @@ -1537,10 +1534,8 @@ bool UnclusteredHighRPStage::initGCNRegion() { // occupancy changes in the DAG and MFI. if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { IsAnyRegionScheduled = true; - if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) { - DAG.MinOccupancy = TempTargetOccupancy; - MFI.increaseOccupancy(MF, TempTargetOccupancy); - } + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) + DAG.setTargetOccupancy(TempTargetOccupancy); } return IsSchedulingThisRegion; } @@ -1589,6 +1584,23 @@ void GCNSchedStage::finalizeGCNRegion() { SavedMutations.swap(DAG.Mutations); } +void PreRARematStage::finalizeGCNRegion() { + GCNSchedStage::finalizeGCNRegion(); + // When the goal is to increase occupancy, all regions must reach the target + // occupancy for rematerializations to be possibly useful, otherwise we will + // just hurt latency for no benefit. If minimum occupancy drops below the + // target there is no point in trying to re-schedule further regions. + if (!TargetOcc) + return; + RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore); + if (DAG.MinOccupancy < *TargetOcc) { + REMAT_DEBUG(dbgs() << "Region " << RegionIdx + << " cannot meet occupancy target, interrupting " + "re-scheduling in all regions\n"); + RescheduleRegions.reset(); + } +} + void GCNSchedStage::checkScheduling() { // Check the results of scheduling. PressureAfter = DAG.getRealRegPressure(RegionIdx); @@ -1862,8 +1874,7 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { } bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { - return GCNSchedStage::shouldRevertScheduling(WavesAfter) || - mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc); + return mayCauseSpilling(WavesAfter); } bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { @@ -2487,6 +2498,10 @@ bool RewriteMFMAFormStage::rewrite( return true; } +unsigned PreRARematStage::getStageTargetOccupancy() const { + return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU(); +} + bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { const Function &F = MF.getFunction(); @@ -2828,13 +2843,31 @@ bool PreRARematStage::isReMaterializable(const MachineInstr &MI) { void PreRARematStage::finalizeGCNSchedStage() { // We consider that reducing spilling is always beneficial so we never - // rollback rematerializations in such cases. It's also possible that - // rescheduling lowers occupancy over the one achieved just through remats, in - // which case we do not want to rollback either (the rescheduling was already - // reverted in PreRARematStage::shouldRevertScheduling in such cases). - unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy); - if (!TargetOcc || MaxOcc >= *TargetOcc) + // rollback rematerializations or revert scheduling in such cases. + if (!TargetOcc) + return; + + // When increasing occupancy, it is possible that re-scheduling is not able to + // achieve the target occupancy in all regions, in which case re-scheduling in + // all regions should be reverted. + if (DAG.MinOccupancy >= *TargetOcc) + return; + for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) { + REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx + << '\n'); + DAG.Pressure[RegionIdx] = MaxPressure; + modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder); + } + + // It is possible that re-scheduling lowers occupancy over the one achieved + // just through rematerializations, in which case we revert re-scheduling in + // all regions but do not roll back rematerializations. + if (AchievedOcc >= *TargetOcc) { + DAG.setTargetOccupancy(AchievedOcc); return; + } + // Reset the target occupancy to what it was pre-rematerialization. + DAG.setTargetOccupancy(*TargetOcc - 1); REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n"); const SIInstrInfo *TII = MF.getSubtarget().getInstrInfo(); @@ -2903,6 +2936,14 @@ void GCNScheduleDAGMILive::updateRegionBoundaries( RegionBounds.first = NewMI; // Insertion } +void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) { + MinOccupancy = TargetOccupancy; + if (MFI.getOccupancy() < TargetOccupancy) + MFI.increaseOccupancy(MF, MinOccupancy); + else + MFI.limitOccupancy(MinOccupancy); +} + static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { const SIInstrInfo *SII = static_cast(DAG->TII); return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index dc85fef958c9a..d934dc90ebd92 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -313,6 +313,9 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { MachineBasicBlock::iterator MI, MachineInstr *NewMI); + /// Makes the scheduler try to achieve an occupancy of \p TargetOccupancy. + void setTargetOccupancy(unsigned TargetOccupancy); + void runSchedStages(); std::unique_ptr createSchedStage(GCNSchedStageID SchedStageID); @@ -555,9 +558,30 @@ class PreRARematStage : public GCNSchedStage { /// objective is spilling reduction. std::optional TargetOcc; /// Achieved occupancy *only* through rematerializations (pre-rescheduling). - /// Smaller than or equal to the target occupancy. unsigned AchievedOcc; + /// State of a region pre-re-scheduling but post-rematerializations that we + /// must keep to be able to revert re-scheduling effects. + struct RegionSchedRevert { + /// Region number; + unsigned RegionIdx; + /// Original instruction order (both debug and non-debug MIs). + std::vector OrigMIOrder; + /// Maximum pressure recorded in the region. + GCNRegPressure MaxPressure; + + RegionSchedRevert(unsigned RegionIdx, ArrayRef OrigMIOrder, + const GCNRegPressure &MaxPressure) + : RegionIdx(RegionIdx), OrigMIOrder(OrigMIOrder), + MaxPressure(MaxPressure) {} + }; + /// After re-scheduling, contains pre-re-scheduling data for all re-scheduled + /// regions. + SmallVector RegionReverts; + + /// Returns the occupancy the stage is trying to achieve. + unsigned getStageTargetOccupancy() const; + /// Returns whether remat can reduce spilling or increase function occupancy /// by 1 through rematerialization. If it can do one, collects instructions in /// PreRARematStage::Rematerializations and sets the target occupancy in @@ -582,6 +606,8 @@ class PreRARematStage : public GCNSchedStage { bool initGCNRegion() override; + void finalizeGCNRegion() override; + bool shouldRevertScheduling(unsigned WavesAfter) override; PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir index 371753801d1a3..050ff23cd1f27 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir @@ -10,6 +10,10 @@ define void @sink_and_inc_idx_when_skipping_small_regions_2() "amdgpu-flat-work-group-size"="1,64" { ret void } + + define void @test_occ_inc_revert_all_regions() { + ret void + } --- name: sink_and_inc_idx_when_skipping_small_region_1 tracksRegLiveness: true @@ -154,3 +158,117 @@ body: | S_NOP 0, implicit %22 S_ENDPGM 0 ... +# bb.1 cannot meet the occupancy target even by rematerializing %64 into it +# even though rematerialization heuristics believes it can; scheduling should +# be interrupted and reverted in all re-scheduled regions. +--- +name: test_occ_inc_revert_all_regions +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; DEBUG: Machine code for function test_occ_inc_revert_all_regions: IsSSA, NoPHIs, TracksLiveness + ; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 7 from rematerializing (original was 7, target was 8) + ; DEBUG: Region 1 cannot meet occupancy target, interrupting re-scheduling in all regions + ; DEBUG: Reverting re-scheduling in region 0 + ; DEBUG: Reverting re-scheduling in region 1 + ; DEBUG-NOT: Reverting re-scheduling in region 3 + ; DEBUG-NOT: Reverting re-scheduling in region 4 + bb.0: + successors: %bb.1 + + %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 + %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 + %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 + %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 + %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0 + %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0 + %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0 + %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0 + %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0 + %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0 + %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0 + %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0 + %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0 + %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0 + %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0 + %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0 + %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0 + %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0 + %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 + %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 + %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 + %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 + %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 + %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 + %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 + %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 + %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 + %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0 + %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0 + %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0 + + %64:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode + + bb.1: + successors: %bb.2 + + S_NOP 0, implicit %64 + + bb.2: + successors: %bb.3 + + S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 + S_NOP 0, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15 + S_NOP 0, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23 + S_NOP 0, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31 + + bb.3: + successors: %bb.4 + + %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 + %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0 + %34:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0 + %35:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode, implicit-def $m0 + %36:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0 + %37:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 37, implicit $exec, implicit $mode, implicit-def $m0 + %38:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 38, implicit $exec, implicit $mode, implicit-def $m0 + %39:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 39, implicit $exec, implicit $mode, implicit-def $m0 + %40:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 40, implicit $exec, implicit $mode, implicit-def $m0 + %41:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 41, implicit $exec, implicit $mode, implicit-def $m0 + %42:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 42, implicit $exec, implicit $mode, implicit-def $m0 + %43:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 43, implicit $exec, implicit $mode, implicit-def $m0 + %44:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 44, implicit $exec, implicit $mode, implicit-def $m0 + %45:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 45, implicit $exec, implicit $mode, implicit-def $m0 + %46:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 46, implicit $exec, implicit $mode, implicit-def $m0 + %47:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 47, implicit $exec, implicit $mode, implicit-def $m0 + %48:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 48, implicit $exec, implicit $mode, implicit-def $m0 + %49:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 49, implicit $exec, implicit $mode, implicit-def $m0 + %50:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 50, implicit $exec, implicit $mode, implicit-def $m0 + %51:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 51, implicit $exec, implicit $mode, implicit-def $m0 + %52:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 52, implicit $exec, implicit $mode, implicit-def $m0 + %53:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 53, implicit $exec, implicit $mode, implicit-def $m0 + %54:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 54, implicit $exec, implicit $mode, implicit-def $m0 + %55:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 55, implicit $exec, implicit $mode, implicit-def $m0 + %56:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 56, implicit $exec, implicit $mode, implicit-def $m0 + %57:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 57, implicit $exec, implicit $mode, implicit-def $m0 + %58:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 58, implicit $exec, implicit $mode, implicit-def $m0 + %59:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 59, implicit $exec, implicit $mode, implicit-def $m0 + %60:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 60, implicit $exec, implicit $mode, implicit-def $m0 + %61:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 61, implicit $exec, implicit $mode, implicit-def $m0 + %62:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 62, implicit $exec, implicit $mode, implicit-def $m0 + %63:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 63, implicit $exec, implicit $mode, implicit-def $m0 + + %65:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 65, implicit $exec, implicit $mode + + bb.4: + S_NOP 0, implicit %32, implicit %33, implicit %34, implicit %35, implicit %36, implicit %37, implicit %38, implicit %39 + S_NOP 0, implicit %40, implicit %41, implicit %42, implicit %43, implicit %44, implicit %45, implicit %46, implicit %47 + S_NOP 0, implicit %48, implicit %49, implicit %50, implicit %51, implicit %52, implicit %53, implicit %54, implicit %55 + S_NOP 0, implicit %56, implicit %57, implicit %58, implicit %59, implicit %60, implicit %61, implicit %62, implicit %63 + S_NOP 0, implicit %65 + + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index 8d24f6ba66968..1ef95deee29cc 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -352,8 +352,8 @@ body: | ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -510,8 +510,8 @@ body: | ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -665,8 +665,8 @@ body: | ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -678,8 +678,8 @@ body: | ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -862,8 +862,8 @@ body: | ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_24]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -876,8 +876,8 @@ body: | ; GFX908-NEXT: successors: %bb.4(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_25]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -3084,9 +3084,13 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 @@ -3096,11 +3100,7 @@ body: | ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 81 ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 82 ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 83 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 84 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -4426,8 +4426,8 @@ body: | ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -4691,8 +4691,8 @@ body: | ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -4974,8 +4974,8 @@ body: | ; GFX908-NEXT: successors: %bb.3(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_35:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 36, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_35]] ; GFX908-NEXT: [[V_CVT_I32_F64_e32_36:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 35, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_35]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_36]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.3: @@ -8607,8 +8607,8 @@ body: | ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: @@ -8760,8 +8760,8 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: [[V_CVT_F64_I32_e32_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_F64_I32_e32_]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: