Conversation
…ector-bits=128." Reapplies llvm#134068. The first patch was missing a check to prevent attempts to pair SVE fill/spill with other Neon load/store instructions, which could happen if the Neon instruction was unscaled.
Also adds a test to make sure this doesn't happen.
Member
|
@llvm/pr-subscribers-backend-aarch64 Author: Ricardo Jesus (rj-jesus) Changes: Reapplies #134068. The first patch was missing a check to prevent attempts to pair SVE fill/spill with other Neon load/store instructions, which could happen specifically if the Neon instruction was unscaled. I've also added a new test to check against possible attempts to pair SVE LDR/STR with Neon instructions. Patch is 21.18 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/135177.diff — 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d370f8c7ff6ea..74217fad82a7e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2760,6 +2760,9 @@ bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
case AArch64::LDRXpre:
case AArch64::LDURSWi:
case AArch64::LDRSWpre:
+ // SVE instructions.
+ case AArch64::LDR_ZXI:
+ case AArch64::STR_ZXI:
return true;
}
}
@@ -2912,6 +2915,18 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
return false;
}
+ // Pairing SVE fills/spills is only valid for little-endian targets that
+ // implement VLS 128.
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AArch64::LDR_ZXI:
+ case AArch64::STR_ZXI:
+ if (!Subtarget.isLittleEndian() ||
+ Subtarget.getSVEVectorSizeInBits() != 128)
+ return false;
+ }
+
// Check if this load/store has a hint to avoid pair formation.
// MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
if (isLdStPairSuppressed(MI))
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 06e633effe874..7c47492cf1a8e 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -298,6 +298,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::STRXui:
case AArch64::STRXpre:
case AArch64::STURXi:
+ case AArch64::STR_ZXI:
case AArch64::LDRDui:
case AArch64::LDURDi:
case AArch64::LDRDpre:
@@ -316,6 +317,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
case AArch64::LDRSui:
case AArch64::LDURSi:
case AArch64::LDRSpre:
+ case AArch64::LDR_ZXI:
return Opc;
case AArch64::LDRSWui:
return AArch64::LDRWui;
@@ -361,6 +363,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
return AArch64::STPDpre;
case AArch64::STRQui:
case AArch64::STURQi:
+ case AArch64::STR_ZXI:
return AArch64::STPQi;
case AArch64::STRQpre:
return AArch64::STPQpre;
@@ -386,6 +389,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
return AArch64::LDPDpre;
case AArch64::LDRQui:
case AArch64::LDURQi:
+ case AArch64::LDR_ZXI:
return AArch64::LDPQi;
case AArch64::LDRQpre:
return AArch64::LDPQpre;
@@ -1225,6 +1229,16 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
(void)MIBSXTW;
LLVM_DEBUG(dbgs() << " Extend operand:\n ");
LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+ } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
+ // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
+ // variant of the registers.
+ MachineOperand &MOp0 = MIB->getOperand(0);
+ MachineOperand &MOp1 = MIB->getOperand(1);
+ assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
+ AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
+ MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
+ MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
+ LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
} else {
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
}
@@ -1499,6 +1513,12 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
if (OpcA == OpcB)
return !AArch64InstrInfo::isPreLdSt(FirstMI);
+ // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
+ // allow pairing them with other instructions.
+ if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
+ OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
+ return false;
+
// Two pre ld/st of different opcodes cannot be merged either
if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
return false;
@@ -2659,7 +2679,8 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
// Get the needed alignments to check them if
// ldp-aligned-only/stp-aligned-only features are opted.
uint64_t MemAlignment = MemOp->getAlign().value();
- uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();
+ uint64_t TypeAlignment =
+ Align(MemOp->getSize().getValue().getKnownMinValue()).value();
if (MemAlignment < 2 * TypeAlignment) {
NumFailedAlignmentCheck++;
@@ -2820,11 +2841,18 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
}
// 3) Find loads and stores that can be merged into a single load or store
// pair instruction.
+ // When compiling for SVE 128, also try to combine SVE fill/spill
+ // instructions into LDP/STP.
// e.g.,
// ldr x0, [x2]
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
+ // e.g.,
+ // ldr z0, [x2]
+ // ldr z1, [x2, #1, mul vl]
+ // ; becomes
+ // ldp q0, q1, [x2]
if (MBB.getParent()->getRegInfo().tracksLiveness()) {
DefinedInBB.clear();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll
new file mode 100644
index 0000000000000..503ead4eba2db
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-fill-spill-pair.ll
@@ -0,0 +1,283 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,ldp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-LDPALIGNEDONLY
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,stp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-STPALIGNEDONLY
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK-OFF
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 -aarch64-sve-vector-bits-max=256 < %s | FileCheck %s --check-prefixes=CHECK-OFF
+
+define void @nxv16i8(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: nxv16i8:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ptrue p0.b
+; CHECK-BE-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-BE-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl]
+; CHECK-BE-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-BE-NEXT: st1b { z1.b }, p0, [x1, #1, mul vl]
+; CHECK-BE-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #1, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: stp q0, q1, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldp q0, q1, [x0]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #1, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0]
+; CHECK-OFF-NEXT: ldr z1, [x0, #1, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1]
+; CHECK-OFF-NEXT: str z1, [x1, #1, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %vl = shl nuw nsw i64 %vscale, 4
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv16i8_max_range(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_max_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0, #-1024]
+; CHECK-NEXT: stp q0, q1, [x1, #1008]
+; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: nxv16i8_max_range:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: rdvl x8, #1
+; CHECK-BE-NEXT: mov x9, #-1008 // =0xfffffffffffffc10
+; CHECK-BE-NEXT: mov x10, #-1024 // =0xfffffffffffffc00
+; CHECK-BE-NEXT: lsr x8, x8, #4
+; CHECK-BE-NEXT: mov w11, #1008 // =0x3f0
+; CHECK-BE-NEXT: mov w12, #1024 // =0x400
+; CHECK-BE-NEXT: ptrue p0.b
+; CHECK-BE-NEXT: mul x9, x8, x9
+; CHECK-BE-NEXT: mul x10, x8, x10
+; CHECK-BE-NEXT: mul x11, x8, x11
+; CHECK-BE-NEXT: ld1b { z1.b }, p0/z, [x0, x9]
+; CHECK-BE-NEXT: mul x8, x8, x12
+; CHECK-BE-NEXT: ld1b { z0.b }, p0/z, [x0, x10]
+; CHECK-BE-NEXT: st1b { z0.b }, p0, [x1, x11]
+; CHECK-BE-NEXT: st1b { z1.b }, p0, [x1, x8]
+; CHECK-BE-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_max_range:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0, #-64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #-63, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: stp q0, q1, [x1, #1008]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_max_range:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldp q0, q1, [x0, #-1024]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1, #63, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8_max_range:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0, #-64, mul vl]
+; CHECK-OFF-NEXT: ldr z1, [x0, #-63, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1, #63, mul vl]
+; CHECK-OFF-NEXT: str z1, [x1, #64, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %ldoff1 = mul i64 %vscale, -1024
+ %ldoff2 = mul i64 %vscale, -1008
+ %stoff1 = mul i64 %vscale, 1008
+ %stoff2 = mul i64 %vscale, 1024
+ %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
+ %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv16i8_outside_range(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_outside_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: nxv16i8_outside_range:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: rdvl x8, #1
+; CHECK-BE-NEXT: mov x9, #-1040 // =0xfffffffffffffbf0
+; CHECK-BE-NEXT: mov x10, #-1024 // =0xfffffffffffffc00
+; CHECK-BE-NEXT: lsr x8, x8, #4
+; CHECK-BE-NEXT: mov w11, #1024 // =0x400
+; CHECK-BE-NEXT: mov w12, #1040 // =0x410
+; CHECK-BE-NEXT: ptrue p0.b
+; CHECK-BE-NEXT: mul x9, x8, x9
+; CHECK-BE-NEXT: mul x10, x8, x10
+; CHECK-BE-NEXT: mul x11, x8, x11
+; CHECK-BE-NEXT: ld1b { z0.b }, p0/z, [x0, x9]
+; CHECK-BE-NEXT: mul x8, x8, x12
+; CHECK-BE-NEXT: ld1b { z1.b }, p0/z, [x0, x10]
+; CHECK-BE-NEXT: st1b { z0.b }, p0, [x1, x11]
+; CHECK-BE-NEXT: st1b { z1.b }, p0, [x1, x8]
+; CHECK-BE-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_outside_range:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_outside_range:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8_outside_range:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0, #-65, mul vl]
+; CHECK-OFF-NEXT: ldr z1, [x0, #-64, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1, #64, mul vl]
+; CHECK-OFF-NEXT: str z1, [x1, #65, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %ldoff1 = mul i64 %vscale, -1040
+ %ldoff2 = mul i64 %vscale, -1024
+ %stoff1 = mul i64 %vscale, 1024
+ %stoff2 = mul i64 %vscale, 1040
+ %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
+ %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv16i8_2vl_stride(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv16i8_2vl_stride:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: nxv16i8_2vl_stride:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ptrue p0.b
+; CHECK-BE-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-BE-NEXT: ld1b { z1.b }, p0/z, [x0, #2, mul vl]
+; CHECK-BE-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-BE-NEXT: st1b { z1.b }, p0, [x1, #2, mul vl]
+; CHECK-BE-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z0, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: str z0, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldr z0, [x0]
+; CHECK-STPALIGNEDONLY-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: str z0, [x1]
+; CHECK-STPALIGNEDONLY-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv16i8_2vl_stride:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0]
+; CHECK-OFF-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1]
+; CHECK-OFF-NEXT: str z1, [x1, #2, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %vl = shl nuw nsw i64 %vscale, 5
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
+ store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
+ ret void
+}
+
+define void @nxv2f64_32b_aligned(ptr %ldptr, ptr %stptr) {
+; CHECK-LABEL: nxv2f64_32b_aligned:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: nxv2f64_32b_aligned:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ptrue p0.d
+; CHECK-BE-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-BE-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-BE-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-BE-NEXT: st1d { z1.d }, p0, [x1, #1, mul vl]
+; CHECK-BE-NEXT: ret
+;
+; CHECK-LDPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
+; CHECK-LDPALIGNEDONLY: // %bb.0:
+; CHECK-LDPALIGNEDONLY-NEXT: ldp q0, q1, [x0]
+; CHECK-LDPALIGNEDONLY-NEXT: stp q0, q1, [x1]
+; CHECK-LDPALIGNEDONLY-NEXT: ret
+;
+; CHECK-STPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
+; CHECK-STPALIGNEDONLY: // %bb.0:
+; CHECK-STPALIGNEDONLY-NEXT: ldp q0, q1, [x0]
+; CHECK-STPALIGNEDONLY-NEXT: stp q0, q1, [x1]
+; CHECK-STPALIGNEDONLY-NEXT: ret
+;
+; CHECK-OFF-LABEL: nxv2f64_32b_aligned:
+; CHECK-OFF: // %bb.0:
+; CHECK-OFF-NEXT: ldr z0, [x0]
+; CHECK-OFF-NEXT: ldr z1, [x0, #1, mul vl]
+; CHECK-OFF-NEXT: str z0, [x1]
+; CHECK-OFF-NEXT: str z1, [x1, #1, mul vl]
+; CHECK-OFF-NEXT: ret
+ %vscale = tail call i64 @llvm.vscale()
+ %vl = shl nuw nsw i64 %vscale, 4
+ %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
+ %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
+ %ld1 = load <vscale x 2 x double>, ptr %ldptr, align 32
+ %ld2 = load <vscale x 2 x double>, ptr %ldptr2, align 32
+ store <vscale x 2 x double> %ld1, ptr %stptr, align 32
+ store <vscale x 2 x double> %ld2, ptr %stptr2, align 32
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-vls-ldst-opt.mir b/llvm/test/CodeGen/AArch64/sve-vls-ldst-opt.mir
new file mode 100644
index 0000000000000..49453bc178914
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vls-ldst-opt.mir
@@ -0,0 +1,74 @@
+# RUN: llc -mtriple=aarch64-unknown-linux -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 -run-pass=aarch64-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: pair-sve-fill-spill
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+ renamable $z1 = LDR_ZXI killed renamable $x0, 1 :: (load (<vscale x 1 x s128>))
+ STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+ STR_ZXI killed renamable $z1, killed renamable $x1, 1 :: (store (<vscale x 1 x s128>))
+ RET_ReallyLR
+...
+# CHECK-LABEL: name: pair-sve-fill-spill
+# CHECK: $q0, $q1 = LDPQi renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+# CHECK: STPQi killed $q0, killed $q1, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+---
+name: do-not-pair-sve-with-neon-scaled
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; SVE LDR + Neon LDR
+ renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+ renamable $q1 = LDRQui renamable $x0, 1 :: (load (s128))
+ ; Neon LDR + SVE LDR
+ renamable $q2 = LDRQui renamable $x0, 3 :: (load (s128))
+ renamable $z3 = LDR_ZXI renamable $x0, 4 :: (load (<vscale x 1 x s128>))
+ ; SVE STR + Neon STR
+ STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+ STRQui killed renamable $q1, renamable $x1, 1 :: (store (s128))
+ ; Neon STR + SVE STR
+ STRQui killed renamable $q2, renamable $x1, 3 :: (store (s128))
+ STR_ZXI killed renamable $z3, renamable $x1, 4 :: (store (<vscale x 1 x s128>))
+ RET_ReallyLR
+...
+# CHECK-LABEL: name: do-not-pair-sve-with-neon-scaled
+# CHECK: renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+# CHECK: renamable $q1 = LDRQui renamable $x0, 1 :: (load (s128))
+# CHECK: renamable $q2 = LDRQui renamable $x0, 3 :: (load (s128))
+# CHECK: renamable $z3 = LDR_ZXI renamable $x0, 4 :: (load (<vscale x 1 x s128>))
+# CHECK: STR_ZXI killed renamable $z0, renamable $x1, 0 :: (store (<vscale x 1 x s128>))
+# CHECK: STRQui killed renamable $q1, renamable $x1, 1 :: (store (s128))
+# CHECK: STRQui killed renamable $q2, renamable $x1, 3 :: (store (s128))
+# CHECK: STR_ZXI killed renamable $z3, renamable $x1, 4 :: (store (<vscale x 1 x s128>))
+---
+name: do-not-pair-sve-with-neon-unscaled
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; SVE LDR + Neon LDUR
+ renamable $z0 = LDR_ZXI renamable $x0, 0 :: (load (<vscale x 1 x s128>))
+ renamable $q1 = LDURQi renamable $x0, 16 :: (load (s128))
+ ; Neon LDUR + SVE LDR
+ renam...
[truncated]
|
paulwalker-arm
approved these changes
Apr 10, 2025
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Reapplies #134068.
The first patch was missing a check to prevent attempts to pair SVE fill/spill with other Neon load/store instructions, which could happen specifically if the Neon instruction was unscaled.
I've also added a new test to check against possible attempts to pair SVE LDR/STR with Neon instructions.