Aarch64 codegen: represent bool true as -1, not 1.

cfallin · cfallin · commit b8f6d53a6bdf · 2020-07-22T12:30:55.000-07:00
It seems that this is actually the correct behavior for bool types wider than `b1`; some of the vector instruction optimizations depend on bool lanes representing false and true as all-zeroes and all-ones respectively. For `b8`..`b64`, this results in an extra negation after a `cset` when a bool is produced by an `icmp`/`fcmp`, but the most common case (`b1`) is unaffected, because an all-ones one-bit value is just `1`. An example of this assumption can be seen here: https://github.com/bytecodealliance/wasmtime/blob/399ee0a54c3be0f89ae07a0af5104ba929a9eba4/cranelift/codegen/src/simple_preopt.rs#L956 Thanks to Joey Gouly of ARM for noting this issue while implementing SIMD support, and digging into the source (finding the above example) to determine the correct behavior.
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1380,13 +1380,6 @@ impl MachInstEmit for Inst {
             &Inst::MovFromNZCV { rd } => {
                 sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg()));
             }
-            &Inst::CondSet { rd, cond } => {
-                sink.put4(
-                    0b100_11010100_11111_0000_01_11111_00000
-                        | (cond.invert().bits() << 12)
-                        | machreg_to_gpr(rd.to_reg()),
-                );
-            }
             &Inst::Extend {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1888,14 +1888,6 @@ fn test_aarch64_binemit() {
         "1B423BD5",
         "mrs x27, nzcv",
     ));
-    insns.push((
-        Inst::CondSet {
-            rd: writable_xreg(5),
-            cond: Cond::Hi,
-        },
-        "E5979F9A",
-        "cset x5, hi",
-    ));
     insns.push((
         Inst::VecDup {
             rd: writable_vreg(25),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -809,12 +809,6 @@ pub enum Inst {
         rd: Writable<Reg>,
     },
 
-    /// Set a register to 1 if condition, else 0.
-    CondSet {
-        rd: Writable<Reg>,
-        cond: Cond,
-    },
-
     /// A machine call instruction. N.B.: this allows only a +/- 128MB offset (it uses a relocation
     /// of type `Reloc::Arm64Call`); if the destination distance is not `RelocDistance::Near`, the
     /// code should use a `LoadExtName` / `CallInd` sequence instead, allowing an arbitrary 64-bit
@@ -1358,9 +1352,6 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
         &Inst::MovFromNZCV { rd } => {
             collector.add_def(rd);
         }
-        &Inst::CondSet { rd, .. } => {
-            collector.add_def(rd);
-        }
         &Inst::Extend { rd, rn, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
@@ -1954,9 +1945,6 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
         &mut Inst::MovFromNZCV { ref mut rd } => {
             map_def(mapper, rd);
         }
-        &mut Inst::CondSet { ref mut rd, .. } => {
-            map_def(mapper, rd);
-        }
         &mut Inst::Extend {
             ref mut rd,
             ref mut rn,
@@ -2830,11 +2818,6 @@ impl Inst {
                 let rd = rd.to_reg().show_rru(mb_rru);
                 format!("mrs {}, nzcv", rd)
             }
-            &Inst::CondSet { rd, cond } => {
-                let rd = rd.to_reg().show_rru(mb_rru);
-                let cond = cond.show_rru(mb_rru);
-                format!("cset {}, {}", rd, cond)
-            }
             &Inst::Extend {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -1014,6 +1014,24 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
     }
 }
 
+/// Convert a 0 / 1 result, such as from a conditional-set instruction, into a 0
+/// / -1 (all-ones) result as expected for bool operations.
+pub(crate) fn normalize_bool_result<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    rd: Writable<Reg>,
+) {
+    // A boolean is 0 / -1; if output width is > 1, negate.
+    if ty_bits(ctx.output_ty(insn, 0)) > 1 {
+        ctx.emit(Inst::AluRRR {
+            alu_op: ALUOp::Sub64,
+            rd,
+            rn: zero_reg(),
+            rm: rd.to_reg(),
+        });
+    }
+}
+
 //=============================================================================
 // Lowering-backend trait implementation.
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1208,6 +1208,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
             let rd = get_output_reg(ctx, outputs[0]);
             ctx.emit(Inst::CSet { rd, cond });
+            normalize_bool_result(ctx, insn, rd);
         }
 
         Opcode::Trueff => {
@@ -1217,6 +1218,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
             let rd = get_output_reg(ctx, outputs[0]);
             ctx.emit(Inst::CSet { rd, cond });
+            normalize_bool_result(ctx, insn, rd);
         }
 
         Opcode::IsNull | Opcode::IsInvalid => {
@@ -1240,6 +1242,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
             ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
             ctx.emit(Inst::CSet { rd, cond: Cond::Eq });
+            normalize_bool_result(ctx, insn, rd);
         }
 
         Opcode::Copy => {
@@ -1249,49 +1252,95 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             ctx.emit(Inst::gen_move(rd, rn, ty));
         }
 
-        Opcode::Bint | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => {
-            // If this is a Bint from a Trueif/Trueff/IsNull/IsInvalid, then the result is already
-            // 64-bit-zero-extended, even if the CLIF type doesn't say so, because it was produced
-            // by a CSet. In this case, we do not need to do any zero-extension.
-            let input_info = ctx.get_input(insn, 0);
-            let src_op = input_info
-                .inst
-                .map(|(src_inst, _)| ctx.data(src_inst).opcode());
-            let narrow_mode = match (src_op, op) {
-                (Some(Opcode::Trueif), Opcode::Bint)
-                | (Some(Opcode::Trueff), Opcode::Bint)
-                | (Some(Opcode::IsNull), Opcode::Bint)
-                | (Some(Opcode::IsInvalid), Opcode::Bint) => NarrowValueMode::None,
-                _ => NarrowValueMode::ZeroExtend64,
-            };
-
-            // All of these ops are simply a move from a zero-extended source.
-            // Here is why this works, in each case:
-            //
-            // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we
-            //   merely need to zero-extend here.
-            //
-            // - Breduce, Bextend: changing width of a boolean. We represent a
-            //   bool as a 0 or 1, so again, this is a zero-extend / no-op.
-            //
-            // - Ireduce: changing width of an integer. Smaller ints are stored
-            //   with undefined high-order bits, so we can simply do a copy.
-
-            let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+        Opcode::Breduce | Opcode::Ireduce => {
+            // Smaller integers/booleans are stored with high-order bits
+            // undefined, so we can simply do a copy.
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]);
             let ty = ctx.input_ty(insn, 0);
             ctx.emit(Inst::gen_move(rd, rn, ty));
         }
 
-        Opcode::Bmask => {
-            // Bool is {0, 1}, so we can subtract from 0 to get all-1s.
+        Opcode::Bextend | Opcode::Bmask => {
+            // Bextend and Bmask both simply sign-extend. This works for:
+            // - Bextend, because booleans are stored as 0 / -1, so we
+            //   sign-extend the -1 to a -1 in the wider width.
+            // - Bmask, because the resulting integer mask value must be
+            //   all-ones (-1) if the argument is true.
+            //
+            // For a sign-extension from a 1-bit value (Case 1 below), we need
+            // to do things a bit specially, because the ISA does not have a
+            // 1-to-N-bit sign extension instruction.  For 8-bit or wider
+            // sources (Case 2 below), we do a sign extension normally.
+
+            let from_ty = ctx.input_ty(insn, 0);
+            let to_ty = ctx.output_ty(insn, 0);
+            let from_bits = ty_bits(from_ty);
+            let to_bits = ty_bits(to_ty);
+
+            assert!(
+                from_bits <= 64 && to_bits <= 64,
+                "Vector Bextend not supported yet"
+            );
+            assert!(from_bits <= to_bits);
+
+            if from_bits == to_bits {
+                // Nothing.
+            } else if from_bits == 1 {
+                assert!(to_bits >= 8);
+                // Case 1: 1-bit to N-bit extension: AND the LSB of source into
+                // dest, generating a value of 0 or 1, then negate to get
+                // 0x000... or 0xfff...
+                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                let rd = get_output_reg(ctx, outputs[0]);
+                // AND Rdest, Rsource, #1
+                ctx.emit(Inst::AluRRImmLogic {
+                    alu_op: ALUOp::And64,
+                    rd,
+                    rn,
+                    imml: ImmLogic::maybe_from_u64(1, I64).unwrap(),
+                });
+                // SUB Rdest, XZR, Rdest  (i.e., NEG Rdest)
+                ctx.emit(Inst::AluRRR {
+                    alu_op: ALUOp::Sub64,
+                    rd,
+                    rn: zero_reg(),
+                    rm: rd.to_reg(),
+                });
+            } else {
+                // Case 2: 8-or-more-bit to N-bit extension: just sign-extend. A
+                // `true` (all ones, or `-1`) will be extended to -1 with the
+                // larger width.
+                assert!(from_bits >= 8);
+                let narrow_mode = if to_bits == 64 {
+                    NarrowValueMode::SignExtend64
+                } else {
+                    assert!(to_bits <= 32);
+                    NarrowValueMode::SignExtend32
+                };
+                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
+                let rd = get_output_reg(ctx, outputs[0]);
+                ctx.emit(Inst::gen_move(rd, rn, to_ty));
+            }
+        }
+
+        Opcode::Bint => {
+            // Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
+            // out the LSB to give a 0 / 1-valued integer result.
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
             let rd = get_output_reg(ctx, outputs[0]);
-            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
-            ctx.emit(Inst::AluRRR {
-                alu_op: ALUOp::Sub64,
+            let output_bits = ty_bits(ctx.output_ty(insn, 0));
+
+            let (imm_ty, alu_op) = if output_bits > 32 {
+                (I64, ALUOp::And64)
+            } else {
+                (I32, ALUOp::And32)
+            };
+            ctx.emit(Inst::AluRRImmLogic {
+                alu_op,
                 rd,
-                rn: zero_reg(),
-                rm,
+                rn,
+                imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
             });
         }
 
@@ -1369,7 +1418,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
                 let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
                 ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
-                ctx.emit(Inst::CondSet { cond, rd });
+                ctx.emit(Inst::CSet { cond, rd });
+                normalize_bool_result(ctx, insn, rd);
             } else {
                 let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
@@ -1394,7 +1444,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     }
                     _ => panic!("Bad float size"),
                 }
-                ctx.emit(Inst::CondSet { cond, rd });
+                ctx.emit(Inst::CSet { cond, rd });
+                normalize_bool_result(ctx, insn, rd);
             } else {
                 lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
             }
@@ -1659,6 +1710,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             });
 
             ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
+            normalize_bool_result(ctx, insn, rd);
         }
 
         Opcode::Shuffle