TargetLowering: Allow FMINNUM/FMAXNUM to lower to FMINIMUM/FMAXIMUM even without nsz (#177828)
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-webassembly

Author: None (valadaptive)

Changes

This restriction was originally added in https://reviews.llvm.org/D143256, with the given justification:

> Currently, in TargetLowering, if the target does not support fminnum, we lower to fminimum if neither operand could be a NaN. But this isn't quite correct because fminnum and fminimum treat +/-0 differently; so, we need to prove that one of the operands isn't a zero.

As far as I can tell, this was never correct. Before #172012, `minnum` and `maxnum` were nondeterministic with regard to signed zero, so it's always been perfectly legal to lower them to operations that order signed zeroes.

Patch is 63.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/177828.diff

7 Files Affected:
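For context, the semantic point is easiest to see side by side. Below is a minimal standalone C++ sketch (the `minnum_ref`/`minimum_ref` helpers are hypothetical reference models, not LLVM code): IEEE-754 2008 minNum, which `ISD::FMINNUM` follows, leaves the ordering of +0 and -0 unspecified, while IEEE-754 2019 minimum, which `ISD::FMINIMUM` follows, orders -0 strictly below +0.

```cpp
// Hypothetical reference models of the two min semantics; not LLVM code.
#include <cmath>
#include <cstdio>

// Models ISD::FMINNUM (IEEE-754 2008 minNum): a NaN loses to a number, and
// the ordering of +0 vs. -0 is unspecified -- either zero is a valid result.
double minnum_ref(double a, double b) {
  if (std::isnan(a)) return b;
  if (std::isnan(b)) return a;
  return b < a ? b : a; // for (+0, -0) this returns +0, but the spec
                        // permits either zero
}

// Models ISD::FMINIMUM (IEEE-754 2019 minimum): NaN propagates, and -0 is
// ordered strictly below +0.
double minimum_ref(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) return NAN;
  if (a == 0.0 && b == 0.0) return std::signbit(a) ? a : b; // -0 < +0
  return b < a ? b : a;
}

int main() {
  // Prints "minimum(+0,-0) = -0, minnum(+0,-0) = 0": minimum pins down the
  // ordering, while minnum was free to return either zero all along.
  std::printf("minimum(+0,-0) = %g, minnum(+0,-0) = %g\n",
              minimum_ref(0.0, -0.0), minnum_ref(0.0, -0.0));
  return 0;
}
```

Since either zero is an acceptable `minnum` result once NaNs are excluded, every value `fminimum` can produce was already among `fminnum`'s allowed results, which is the basis for dropping the signed-zero check below.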
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5cdceb02897e7..d6162c8ce9d18 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8811,12 +8811,9 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
// If the target has FMINIMUM/FMAXIMUM but not FMINNUM/FMAXNUM use that
// instead if there are no NaNs and there can't be an incompatible zero
// compare: at least one operand isn't +/-0, or there are no signed-zeros.
- if ((Node->getFlags().hasNoNaNs() ||
- (DAG.isKnownNeverNaN(Node->getOperand(0)) &&
- DAG.isKnownNeverNaN(Node->getOperand(1)))) &&
- (Node->getFlags().hasNoSignedZeros() ||
- DAG.isKnownNeverZeroFloat(Node->getOperand(0)) ||
- DAG.isKnownNeverZeroFloat(Node->getOperand(1)))) {
+ if (Node->getFlags().hasNoNaNs() ||
+ (DAG.isKnownNeverNaN(Node->getOperand(0)) &&
+ DAG.isKnownNeverNaN(Node->getOperand(1)))) {
unsigned IEEE2018Op =
Node->getOpcode() == ISD::FMINNUM ? ISD::FMINIMUM : ISD::FMAXIMUM;
if (isOperationLegalOrCustom(IEEE2018Op, VT))
diff --git a/llvm/test/CodeGen/ARM/lower-vmax.ll b/llvm/test/CodeGen/ARM/lower-vmax.ll
index 73f0f165fdc73..6dfb466047abb 100644
--- a/llvm/test/CodeGen/ARM/lower-vmax.ll
+++ b/llvm/test/CodeGen/ARM/lower-vmax.ll
@@ -5,16 +5,18 @@
define float @max_f32(float, float) {
; CHECK-NO_NEON-LABEL: max_f32:
; CHECK-NO_NEON: @ %bb.0:
-; CHECK-NO_NEON-NEXT: vcmp.f32 s1, s0
-; CHECK-NO_NEON-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NO_NEON-NEXT: vmovgt.f32 s0, s1
+; CHECK-NO_NEON-NEXT: vmov.f32 s2, s1
+; CHECK-NO_NEON-NEXT: @ kill: def $s0 killed $s0 def $d0
+; CHECK-NO_NEON-NEXT: vmax.f32 d0, d1, d0
+; CHECK-NO_NEON-NEXT: @ kill: def $s0 killed $s0 killed $d0
; CHECK-NO_NEON-NEXT: mov pc, lr
;
; CHECK-NEON-LABEL: max_f32:
; CHECK-NEON: @ %bb.0:
-; CHECK-NEON-NEXT: vcmp.f32 s1, s0
-; CHECK-NEON-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEON-NEXT: vmovgt.f32 s0, s1
+; CHECK-NEON-NEXT: vmov.f32 s2, s1
+; CHECK-NEON-NEXT: @ kill: def $s0 killed $s0 def $d0
+; CHECK-NEON-NEXT: vmax.f32 d0, d1, d0
+; CHECK-NEON-NEXT: @ kill: def $s0 killed $s0 killed $d0
; CHECK-NEON-NEXT: mov pc, lr
%3 = call nnan float @llvm.maxnum.f32(float %1, float %0)
ret float %3
@@ -25,16 +27,18 @@ declare float @llvm.maxnum.f32(float, float) #1
define float @min_f32(float, float) {
; CHECK-NO_NEON-LABEL: min_f32:
; CHECK-NO_NEON: @ %bb.0:
-; CHECK-NO_NEON-NEXT: vcmp.f32 s1, s0
-; CHECK-NO_NEON-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NO_NEON-NEXT: vmovlt.f32 s0, s1
+; CHECK-NO_NEON-NEXT: vmov.f32 s2, s1
+; CHECK-NO_NEON-NEXT: @ kill: def $s0 killed $s0 def $d0
+; CHECK-NO_NEON-NEXT: vmin.f32 d0, d1, d0
+; CHECK-NO_NEON-NEXT: @ kill: def $s0 killed $s0 killed $d0
; CHECK-NO_NEON-NEXT: mov pc, lr
;
; CHECK-NEON-LABEL: min_f32:
; CHECK-NEON: @ %bb.0:
-; CHECK-NEON-NEXT: vcmp.f32 s1, s0
-; CHECK-NEON-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEON-NEXT: vmovlt.f32 s0, s1
+; CHECK-NEON-NEXT: vmov.f32 s2, s1
+; CHECK-NEON-NEXT: @ kill: def $s0 killed $s0 def $d0
+; CHECK-NEON-NEXT: vmin.f32 d0, d1, d0
+; CHECK-NEON-NEXT: @ kill: def $s0 killed $s0 killed $d0
; CHECK-NEON-NEXT: mov pc, lr
%3 = call nnan float @llvm.minnum.f32(float %1, float %0)
ret float %3
diff --git a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
index 6706d25ae01d2..8564d7d9996d3 100644
--- a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
@@ -18,9 +18,7 @@ define float @fminnum32_intrinsic(float %x, float %y) {
; ARMV7: @ %bb.0:
; ARMV7-NEXT: vmov s0, r1
; ARMV7-NEXT: vmov s2, r0
-; ARMV7-NEXT: vcmp.f32 s2, s0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovlt.f32 s0, s2
+; ARMV7-NEXT: vmin.f32 d0, d1, d0
; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
@@ -104,9 +102,7 @@ define float @fmaxnum32_intrinsic(float %x, float %y) {
; ARMV7: @ %bb.0:
; ARMV7-NEXT: vmov s0, r1
; ARMV7-NEXT: vmov s2, r0
-; ARMV7-NEXT: vcmp.f32 s2, s0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s0, s2
+; ARMV7-NEXT: vmax.f32 d0, d1, d0
; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
@@ -160,12 +156,10 @@ define float @fmaxnum32_nsz_intrinsic(float %x, float %y) {
define float @fmaxnum32_zero_intrinsic(float %x) {
; ARMV7-LABEL: fmaxnum32_zero_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vmov s0, r0
-; ARMV7-NEXT: vldr s2, .LCPI5_0
-; ARMV7-NEXT: vcmp.f32 s0, #0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s2, s0
-; ARMV7-NEXT: vmov r0, s2
+; ARMV7-NEXT: vldr s0, .LCPI5_0
+; ARMV7-NEXT: vmov s2, r0
+; ARMV7-NEXT: vmax.f32 d0, d1, d0
+; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
; ARMV7-NEXT: .p2align 2
; ARMV7-NEXT: @ %bb.1:
@@ -490,24 +484,13 @@ define double @fmaxnum64_non_zero_intrinsic(double %x) {
define <4 x float> @fminnumv432_intrinsic(<4 x float> %x, <4 x float> %y) {
; ARMV7-LABEL: fminnumv432_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: mov r12, sp
-; ARMV7-NEXT: vld1.64 {d0, d1}, [r12]
-; ARMV7-NEXT: vmov d3, r2, r3
-; ARMV7-NEXT: vmov d2, r0, r1
-; ARMV7-NEXT: vcmp.f32 s6, s2
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s7, s3
-; ARMV7-NEXT: vmovlt.f32 s2, s6
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s5, s1
-; ARMV7-NEXT: vmovlt.f32 s3, s7
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s4, s0
-; ARMV7-NEXT: vmovlt.f32 s1, s5
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovlt.f32 s0, s4
-; ARMV7-NEXT: vmov r2, r3, d1
-; ARMV7-NEXT: vmov r0, r1, d0
+; ARMV7-NEXT: vmov d17, r2, r3
+; ARMV7-NEXT: vmov d16, r0, r1
+; ARMV7-NEXT: mov r0, sp
+; ARMV7-NEXT: vld1.64 {d18, d19}, [r0]
+; ARMV7-NEXT: vmin.f32 q8, q8, q9
+; ARMV7-NEXT: vmov r0, r1, d16
+; ARMV7-NEXT: vmov r2, r3, d17
; ARMV7-NEXT: bx lr
;
; ARMV8-LABEL: fminnumv432_intrinsic:
@@ -609,26 +592,21 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
; ARMV7-LABEL: fminnumv432_one_zero_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vmov d1, r2, r3
-; ARMV7-NEXT: vldr s4, .LCPI18_0
-; ARMV7-NEXT: vmov d0, r0, r1
-; ARMV7-NEXT: vmov.f32 s6, #-1.000000e+00
-; ARMV7-NEXT: vcmp.f32 s1, #0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmov.f32 s8, s3
-; ARMV7-NEXT: vmin.f32 d7, d1, d3
-; ARMV7-NEXT: vmin.f32 d6, d0, d3
-; ARMV7-NEXT: vmin.f32 d4, d4, d3
-; ARMV7-NEXT: vmovlt.f32 s4, s1
-; ARMV7-NEXT: vmov.f32 s13, s4
-; ARMV7-NEXT: vmov.f32 s15, s8
-; ARMV7-NEXT: vmov r0, r1, d6
-; ARMV7-NEXT: vmov r2, r3, d7
+; ARMV7-NEXT: vmov d17, r2, r3
+; ARMV7-NEXT: vmov d16, r0, r1
+; ARMV7-NEXT: adr r0, .LCPI18_0
+; ARMV7-NEXT: vld1.64 {d18, d19}, [r0:128]
+; ARMV7-NEXT: vmin.f32 q8, q8, q9
+; ARMV7-NEXT: vmov r0, r1, d16
+; ARMV7-NEXT: vmov r2, r3, d17
; ARMV7-NEXT: bx lr
-; ARMV7-NEXT: .p2align 2
+; ARMV7-NEXT: .p2align 4
; ARMV7-NEXT: @ %bb.1:
; ARMV7-NEXT: .LCPI18_0:
+; ARMV7-NEXT: .long 0xbf800000 @ float -1
; ARMV7-NEXT: .long 0x00000000 @ float 0
+; ARMV7-NEXT: .long 0xbf800000 @ float -1
+; ARMV7-NEXT: .long 0xbf800000 @ float -1
;
; ARMV8-LABEL: fminnumv432_one_zero_intrinsic:
; ARMV8: @ %bb.0:
@@ -672,24 +650,13 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
define <4 x float> @fmaxnumv432_intrinsic(<4 x float> %x, <4 x float> %y) {
; ARMV7-LABEL: fmaxnumv432_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: mov r12, sp
-; ARMV7-NEXT: vld1.64 {d0, d1}, [r12]
-; ARMV7-NEXT: vmov d3, r2, r3
-; ARMV7-NEXT: vmov d2, r0, r1
-; ARMV7-NEXT: vcmp.f32 s6, s2
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s7, s3
-; ARMV7-NEXT: vmovgt.f32 s2, s6
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s5, s1
-; ARMV7-NEXT: vmovgt.f32 s3, s7
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s4, s0
-; ARMV7-NEXT: vmovgt.f32 s1, s5
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s0, s4
-; ARMV7-NEXT: vmov r2, r3, d1
-; ARMV7-NEXT: vmov r0, r1, d0
+; ARMV7-NEXT: vmov d17, r2, r3
+; ARMV7-NEXT: vmov d16, r0, r1
+; ARMV7-NEXT: mov r0, sp
+; ARMV7-NEXT: vld1.64 {d18, d19}, [r0]
+; ARMV7-NEXT: vmax.f32 q8, q8, q9
+; ARMV7-NEXT: vmov r0, r1, d16
+; ARMV7-NEXT: vmov r2, r3, d17
; ARMV7-NEXT: bx lr
;
; ARMV8-LABEL: fmaxnumv432_intrinsic:
@@ -757,31 +724,13 @@ define <4 x float> @fmaxnumv432_nsz_intrinsic(<4 x float> %x, <4 x float> %y) {
define <4 x float> @fmaxnumv432_zero_intrinsic(<4 x float> %x) {
; ARMV7-LABEL: fmaxnumv432_zero_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vmov d3, r2, r3
-; ARMV7-NEXT: vldr s0, .LCPI21_0
-; ARMV7-NEXT: vmov d2, r0, r1
-; ARMV7-NEXT: vcmp.f32 s6, #0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmov.f32 s2, s0
-; ARMV7-NEXT: vcmp.f32 s7, #0
-; ARMV7-NEXT: vmovgt.f32 s2, s6
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmov.f32 s3, s0
-; ARMV7-NEXT: vcmp.f32 s5, #0
-; ARMV7-NEXT: vmovgt.f32 s3, s7
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmov.f32 s1, s0
-; ARMV7-NEXT: vcmp.f32 s4, #0
-; ARMV7-NEXT: vmovgt.f32 s1, s5
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s0, s4
-; ARMV7-NEXT: vmov r2, r3, d1
-; ARMV7-NEXT: vmov r0, r1, d0
+; ARMV7-NEXT: vmov d19, r2, r3
+; ARMV7-NEXT: vmov.i32 q8, #0x0
+; ARMV7-NEXT: vmov d18, r0, r1
+; ARMV7-NEXT: vmax.f32 q8, q9, q8
+; ARMV7-NEXT: vmov r0, r1, d16
+; ARMV7-NEXT: vmov r2, r3, d17
; ARMV7-NEXT: bx lr
-; ARMV7-NEXT: .p2align 2
-; ARMV7-NEXT: @ %bb.1:
-; ARMV7-NEXT: .LCPI21_0:
-; ARMV7-NEXT: .long 0x00000000 @ float 0
;
; ARMV8-LABEL: fmaxnumv432_zero_intrinsic:
; ARMV8: @ %bb.0:
@@ -809,31 +758,13 @@ define <4 x float> @fmaxnumv432_zero_intrinsic(<4 x float> %x) {
define <4 x float> @fmaxnumv432_minus_zero_intrinsic(<4 x float> %x) {
; ARMV7-LABEL: fmaxnumv432_minus_zero_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vldr s0, .LCPI22_0
-; ARMV7-NEXT: vmov d3, r2, r3
-; ARMV7-NEXT: vmov d2, r0, r1
-; ARMV7-NEXT: vcmp.f32 s6, s0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s7, s0
-; ARMV7-NEXT: vmov.f32 s2, s0
-; ARMV7-NEXT: vmovgt.f32 s2, s6
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s5, s0
-; ARMV7-NEXT: vmov.f32 s3, s0
-; ARMV7-NEXT: vmovgt.f32 s3, s7
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vcmp.f32 s4, s0
-; ARMV7-NEXT: vmov.f32 s1, s0
-; ARMV7-NEXT: vmovgt.f32 s1, s5
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s0, s4
-; ARMV7-NEXT: vmov r2, r3, d1
-; ARMV7-NEXT: vmov r0, r1, d0
+; ARMV7-NEXT: vmov d19, r2, r3
+; ARMV7-NEXT: vmov.i32 q8, #0x80000000
+; ARMV7-NEXT: vmov d18, r0, r1
+; ARMV7-NEXT: vmax.f32 q8, q9, q8
+; ARMV7-NEXT: vmov r0, r1, d16
+; ARMV7-NEXT: vmov r2, r3, d17
; ARMV7-NEXT: bx lr
-; ARMV7-NEXT: .p2align 2
-; ARMV7-NEXT: @ %bb.1:
-; ARMV7-NEXT: .LCPI22_0:
-; ARMV7-NEXT: .long 0x80000000 @ float -0
;
; ARMV8-LABEL: fmaxnumv432_minus_zero_intrinsic:
; ARMV8: @ %bb.0:
diff --git a/llvm/test/CodeGen/ARM/vminmax.ll b/llvm/test/CodeGen/ARM/vminmax.ll
index dc4a6ac2a79b0..b026e2956f87c 100644
--- a/llvm/test/CodeGen/ARM/vminmax.ll
+++ b/llvm/test/CodeGen/ARM/vminmax.ll
@@ -1,129 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @vmins8(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vmins8:
-;CHECK: vmin.s8
- %tmp1 = load <8 x i8>, ptr %A
- %tmp2 = load <8 x i8>, ptr %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i8> %tmp3
+; CHECK-LABEL: vmins8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vmin.s8 d16, d17, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <8 x i8>, ptr %A
+ %tmp2 = load <8 x i8>, ptr %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
}
define <4 x i16> @vmins16(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vmins16:
-;CHECK: vmin.s16
- %tmp1 = load <4 x i16>, ptr %A
- %tmp2 = load <4 x i16>, ptr %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i16> %tmp3
+; CHECK-LABEL: vmins16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vmin.s16 d16, d17, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <4 x i16>, ptr %A
+ %tmp2 = load <4 x i16>, ptr %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
}
define <2 x i32> @vmins32(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vmins32:
-;CHECK: vmin.s32
- %tmp1 = load <2 x i32>, ptr %A
- %tmp2 = load <2 x i32>, ptr %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i32> %tmp3
+; CHECK-LABEL: vmins32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vmin.s32 d16, d17, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <2 x i32>, ptr %A
+ %tmp2 = load <2 x i32>, ptr %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
}
define <8 x i8> @vminu8(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminu8:
-;CHECK: vmin.u8
- %tmp1 = load <8 x i8>, ptr %A
- %tmp2 = load <8 x i8>, ptr %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
- ret <8 x i8> %tmp3
+; CHECK-LABEL: vminu8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vmin.u8 d16, d17, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <8 x i8>, ptr %A
+ %tmp2 = load <8 x i8>, ptr %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
}
define <4 x i16> @vminu16(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminu16:
-;CHECK: vmin.u16
- %tmp1 = load <4 x i16>, ptr %A
- %tmp2 = load <4 x i16>, ptr %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
- ret <4 x i16> %tmp3
+; CHECK-LABEL: vminu16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vmin.u16 d16, d17, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <4 x i16>, ptr %A
+ %tmp2 = load <4 x i16>, ptr %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
}
define <2 x i32> @vminu32(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminu32:
-;CHECK: vmin.u32
- %tmp1 = load <2 x i32>, ptr %A
- %tmp2 = load <2 x i32>, ptr %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
- ret <2 x i32> %tmp3
+; CHECK-LABEL: vminu32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vmin.u32 d16, d17, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <2 x i32>, ptr %A
+ %tmp2 = load <2 x i32>, ptr %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
}
define <2 x float> @vminf32(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminf32:
-;CHECK: vmin.f32
- %tmp1 = load <2 x float>, ptr %A
- %tmp2 = load <2 x float>, ptr %B
- %tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
- ret <2 x float> %tmp3
+; CHECK-LABEL: vminf32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vmin.f32 d16, d17, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <2 x float>, ptr %A
+ %tmp2 = load <2 x float>, ptr %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
}
define <16 x i8> @vminQs8(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminQs8:
-;CHECK: vmin.s8
- %tmp1 = load <16 x i8>, ptr %A
- %tmp2 = load <16 x i8>, ptr %B
- %tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
- ret <16 x i8> %tmp3
+; CHECK-LABEL: vminQs8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vmin.s8 q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <16 x i8>, ptr %A
+ %tmp2 = load <16 x i8>, ptr %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
}
define <8 x i16> @vminQs16(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminQs16:
-;CHECK: vmin.s16
- %tmp1 = load <8 x i16>, ptr %A
- %tmp2 = load <8 x i16>, ptr %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
- ret <8 x i16> %tmp3
+; CHECK-LABEL: vminQs16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vmin.s16 q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <8 x i16>, ptr %A
+ %tmp2 = load <8 x i16>, ptr %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
}
define <4 x i32> @vminQs32(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminQs32:
-;CHECK: vmin.s32
- %tmp1 = load <4 x i32>, ptr %A
- %tmp2 = load <4 x i32>, ptr %B
- %tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
- ret <4 x i32> %tmp3
+; CHECK-LABEL: vminQs32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vmin.s32 q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <4 x i32>, ptr %A
+ %tmp2 = load <4 x i32>, ptr %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
}
define <16 x i8> @vminQu8(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminQu8:
-;CHECK: vmin.u8
- %tmp1 = load <16 x i8>, ptr %A
- %tmp2 = load <16 x i8>, ptr %B
- %tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
- ret <16 x i8> %tmp3
+; CHECK-LABEL: vminQu8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vmin.u8 q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <16 x i8>, ptr %A
+ %tmp2 = load <16 x i8>, ptr %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
}
define <8 x i16> @vminQu16(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL: vminQu16:
-;CHECK: vmin.u16
- %tmp1 = load <8 x i16>, ptr %A
- %tmp2 = load <8 x i16>, ptr %B
- %tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
- ret <8 x i16> %tmp3
+; CHECK-LABEL: vminQu16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vmin.u16 q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: mov pc, lr
+ %tmp1 = load <8 x i16>, ptr %A
+ %tmp2 = load <8 x i16>, ptr %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
}
define <4 x i32> @vminQu32(ptr %A, ptr %B) nounwind {
-;CHECK-LABEL...
[truncated]