diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 842655d0ca0e9..8e21a93799772 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20771,6 +20771,103 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
                      CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
 }
 
+// Attempt to use REVs for half-rotations of vectors of i16, i32 and i64.
+// Patterns for i32:
+//
+// (OR (SHL_PRED Pg, X, (splat 16)),
+//     (SRL_PRED Pg, X, (splat 16)))
+// =>
+// REVH Pg, X, poison
+//
+// (OR (VSHL X, 16), (VLSHR X, 16))
+// =>
+// NVCAST (REV32 X)
+static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG,
+                               TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  unsigned EltSize = VT.getScalarSizeInBits();
+  if (EltSize != 16 && EltSize != 32 && EltSize != 64)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (VT.isScalableVector()) {
+    // Canonicalise so that N0 is the left shift.
+    if (N0.getOpcode() == AArch64ISD::SRL_PRED)
+      std::swap(N0, N1);
+    if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+        N1.getOpcode() != AArch64ISD::SRL_PRED)
+      return SDValue();
+
+    // Ensure we have common inputs.
+    if (N0.getOperand(0) != N1.getOperand(0) ||
+        N0.getOperand(1) != N1.getOperand(1) ||
+        N0.getOperand(2) != N1.getOperand(2))
+      return SDValue();
+
+    APInt ShAmt;
+    if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShAmt) ||
+        EltSize / 2 != ShAmt)
+      return SDValue();
+
+    unsigned RevOp;
+    if (EltSize == 16)
+      RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
+    else if (EltSize == 32)
+      RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
+    else /* EltSize == 64 */
+      RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
+
+    return DAG.getNode(RevOp, DL, VT, N0.getOperand(0), N0.getOperand(1),
+                       DAG.getPOISON(VT));
+  }
+
+  assert(VT.isFixedLengthVector() && "Expected fixed length vector type");
+
+  // Half rotations of i16 vectors should be combined to bswap, so we shouldn't
+  // need custom code for them here.
+  // Note: This doesn't apply to scalable vectors as we allow arbitrary (but
+  // matching) predicates in the shifts. Predicated rotations aren't matched to
+  // rotl / rotr, and subsequently aren't combined to bswap.
+  if (EltSize == 16)
+    return SDValue();
+
+  if (N0.getOpcode() == AArch64ISD::VLSHR)
+    std::swap(N0, N1);
+  if (N0.getOpcode() != AArch64ISD::VSHL || N1.getOpcode() != AArch64ISD::VLSHR)
+    return SDValue();
+
+  // Ensure common inputs.
+  if (N0.getOperand(0) != N1.getOperand(0) ||
+      N0.getOperand(1) != N1.getOperand(1))
+    return SDValue();
+
+  if (EltSize / 2 != N0.getConstantOperandVal(1))
+    return SDValue();
+
+  EVT HalfVT;
+  unsigned RevOp;
+  if (EltSize == 32) {
+    RevOp = AArch64ISD::REV32;
+    HalfVT = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+  } else /* EltSize == 64 */ {
+    RevOp = AArch64ISD::REV64;
+    HalfVT = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
+  }
+
+  return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+                     DAG.getNode(RevOp, DL, HalfVT, N0->getOperand(0)));
+}
+
 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget,
                                 const AArch64TargetLowering &TLI) {
@@ -20779,6 +20876,9 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   if (SDValue R = performANDORCSELCombine(N, DAG))
     return R;
 
+  if (SDValue R = tryCombineToREV(N, DAG, DCI))
+    return R;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 8f6f4510d8388..e72745e551dbb 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -296,6 +296,73 @@ define <vscale x 2 x i64> @xar_nxv2i64_shifts_neg(<vscale x 2 x i64> %x, <vscale
   ret <vscale x 2 x i64> %or
 }
 
+; Don't use XAR if REV[BHW] can be used.
+
+define <vscale x 8 x i16> @revb_nxv8i16(<vscale x 8 x i16> %r) {
+; CHECK-LABEL: revb_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %or = tail call <vscale x 8 x i16> @llvm.fshl(<vscale x 8 x i16> %r, <vscale x 8 x i16> %r, <vscale x 8 x i16> splat (i16 8))
+  ret <vscale x 8 x i16> %or
+}
+
+define <vscale x 4 x i32> @revh_nxv4i32(<vscale x 4 x i32> %r) {
+; CHECK-LABEL: revh_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    revh z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %or = tail call <vscale x 4 x i32> @llvm.fshl(<vscale x 4 x i32> %r, <vscale x 4 x i32> %r, <vscale x 4 x i32> splat (i32 16))
+  ret <vscale x 4 x i32> %or
+}
+
+define <vscale x 2 x i64> @revw_nx2i64(<vscale x 2 x i64> %r) {
+; CHECK-LABEL: revw_nx2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    revw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %or = tail call <vscale x 2 x i64> @llvm.fshl(<vscale x 2 x i64> %r, <vscale x 2 x i64> %r, <vscale x 2 x i64> splat (i64 32))
+  ret <vscale x 2 x i64> %or
+}
+
+; As above, one test with rotate right.
+define <vscale x 2 x i64> @revw_nx2i64_r(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: revw_nx2i64_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    revw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %r = tail call <vscale x 2 x i64> @llvm.fshr(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 32))
+  ret <vscale x 2 x i64> %r
+}
+
+; As above, one test with predicated shifts instead of rotate left.
+define <vscale x 4 x i32> @revh_nx4i32_shifts_l(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: revh_nx4i32_shifts_l:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revh z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %shl = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
+  %shr = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
+  %or = or <vscale x 4 x i32> %shl, %shr
+  ret <vscale x 4 x i32> %or
+}
+
+; As above, one test with predicated shifts instead of rotate right.
+define <vscale x 8 x i16> @revb_nx8i16_shifts_r(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: revb_nx8i16_shifts_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revb z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+  %shr = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
+  %shl = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
+  %or = or <vscale x 8 x i16> %shr, %shl
+  ret <vscale x 8 x i16> %or
+}
+
 declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index 652617b58eaf3..4f940a39dfe39 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=CHECK,SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=CHECK,NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefixes=CHECK,SVE2 %s
 
 /* 128-bit vectors */
 
@@ -359,6 +359,96 @@ entry:
   ret <8 x i8> %or
 }
 
+; Don't use XAR if REV16/REV32/REV64 can be used.
+
+define <4 x i16> @rev16_v4i16(<4 x i16> %r) {
+; CHECK-LABEL: rev16_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %or = tail call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 8))
+  ret <4 x i16> %or
+}
+
+define <2 x i32> @rev32_v2i32(<2 x i32> %r) {
+; CHECK-LABEL: rev32_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %or = tail call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 16))
+  ret <2 x i32> %or
+}
+
+define <1 x i64> @rev64_v1i64(<1 x i64> %r) {
+; CHECK-LABEL: rev64_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %or = tail call <1 x i64> @llvm.fshl(<1 x i64> %r, <1 x i64> %r, <1 x i64> splat (i64 32))
+  ret <1 x i64> %or
+}
+
+define <8 x i16> @rev16_v8i16(<8 x i16> %r) {
+; CHECK-LABEL: rev16_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    ret
+  %or = tail call <8 x i16> @llvm.fshl(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 8))
+  ret <8 x i16> %or
+}
+
+define <4 x i32> @rev32_v4i32(<4 x i32> %r) {
+; CHECK-LABEL: rev32_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.8h, v0.8h
+; CHECK-NEXT:    ret
+  %or = tail call <4 x i32> @llvm.fshl(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 16))
+  ret <4 x i32> %or
+}
+
+define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
+; CHECK-LABEL: rev64_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %or = tail call <2 x i64> @llvm.fshl(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 32))
+  ret <2 x i64> %or
+}
+
+; As above, one test with rotate right.
+define <1 x i64> @rev64_v1i64_r(<1 x i64> %a) {
+; CHECK-LABEL: rev64_v1i64_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-NEXT:    ret
+  %r = tail call <1 x i64> @llvm.fshr(<1 x i64> %a, <1 x i64> %a, <1 x i64> splat (i64 32))
+  ret <1 x i64> %r
+}
+
+; As above, one test with individual shifts instead of rotate left.
+define <2 x i32> @rev32_v2i32_shifts_l(<2 x i32> %a) {
+; CHECK-LABEL: rev32_v2i32_shifts_l:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %shl = shl <2 x i32> %a, splat (i32 16)
+  %shr = lshr <2 x i32> %a, splat (i32 16)
+  %or = or <2 x i32> %shl, %shr
+  ret <2 x i32> %or
+}
+
+; As above, one test with individual shifts instead of rotate right.
+define <4 x i16> @rev16_v4i16_shifts_r(<4 x i16> %a) {
+; CHECK-LABEL: rev16_v4i16_shifts_r:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    ret
+  %shr = lshr <4 x i16> %a, splat (i16 8)
+  %shl = shl <4 x i16> %a, splat (i16 8)
+  %or = or <4 x i16> %shr, %shl
+  ret <4 x i16> %or
+}
+
 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)