Skip to content

Commit b8f6d53

Browse files
committed
Aarch64 codegen: represent bool true as -1, not 1.
It seems that this is actually the correct behavior for bool types wider than `b1`; some of the vector instruction optimizations depend on bool lanes representing false and true as all-zeroes and all-ones respectively. For `b8`..`b64`, this results in an extra negation after a `cset` when a bool is produced by an `icmp`/`fcmp`, but the most common case (`b1`) is unaffected, because an all-ones one-bit value is just `1`. An example of this assumption can be seen here: https://github.com/bytecodealliance/wasmtime/blob/399ee0a54c3be0f89ae07a0af5104ba929a9eba4/cranelift/codegen/src/simple_preopt.rs#L956 Thanks to Joey Gouly of ARM for noting this issue while implementing SIMD support, and digging into the source (finding the above example) to determine the correct behavior.
1 parent c420f65 commit b8f6d53

5 files changed

Lines changed: 108 additions & 70 deletions

File tree

cranelift/codegen/src/isa/aarch64/inst/emit.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,13 +1380,6 @@ impl MachInstEmit for Inst {
13801380
&Inst::MovFromNZCV { rd } => {
13811381
sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg()));
13821382
}
1383-
&Inst::CondSet { rd, cond } => {
1384-
sink.put4(
1385-
0b100_11010100_11111_0000_01_11111_00000
1386-
| (cond.invert().bits() << 12)
1387-
| machreg_to_gpr(rd.to_reg()),
1388-
);
1389-
}
13901383
&Inst::Extend {
13911384
rd,
13921385
rn,

cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1888,14 +1888,6 @@ fn test_aarch64_binemit() {
18881888
"1B423BD5",
18891889
"mrs x27, nzcv",
18901890
));
1891-
insns.push((
1892-
Inst::CondSet {
1893-
rd: writable_xreg(5),
1894-
cond: Cond::Hi,
1895-
},
1896-
"E5979F9A",
1897-
"cset x5, hi",
1898-
));
18991891
insns.push((
19001892
Inst::VecDup {
19011893
rd: writable_vreg(25),

cranelift/codegen/src/isa/aarch64/inst/mod.rs

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -809,12 +809,6 @@ pub enum Inst {
809809
rd: Writable<Reg>,
810810
},
811811

812-
/// Set a register to 1 if condition, else 0.
813-
CondSet {
814-
rd: Writable<Reg>,
815-
cond: Cond,
816-
},
817-
818812
/// A machine call instruction. N.B.: this allows only a +/- 128MB offset (it uses a relocation
819813
/// of type `Reloc::Arm64Call`); if the destination distance is not `RelocDistance::Near`, the
820814
/// code should use a `LoadExtName` / `CallInd` sequence instead, allowing an arbitrary 64-bit
@@ -1358,9 +1352,6 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
13581352
&Inst::MovFromNZCV { rd } => {
13591353
collector.add_def(rd);
13601354
}
1361-
&Inst::CondSet { rd, .. } => {
1362-
collector.add_def(rd);
1363-
}
13641355
&Inst::Extend { rd, rn, .. } => {
13651356
collector.add_def(rd);
13661357
collector.add_use(rn);
@@ -1954,9 +1945,6 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
19541945
&mut Inst::MovFromNZCV { ref mut rd } => {
19551946
map_def(mapper, rd);
19561947
}
1957-
&mut Inst::CondSet { ref mut rd, .. } => {
1958-
map_def(mapper, rd);
1959-
}
19601948
&mut Inst::Extend {
19611949
ref mut rd,
19621950
ref mut rn,
@@ -2830,11 +2818,6 @@ impl Inst {
28302818
let rd = rd.to_reg().show_rru(mb_rru);
28312819
format!("mrs {}, nzcv", rd)
28322820
}
2833-
&Inst::CondSet { rd, cond } => {
2834-
let rd = rd.to_reg().show_rru(mb_rru);
2835-
let cond = cond.show_rru(mb_rru);
2836-
format!("cset {}, {}", rd, cond)
2837-
}
28382821
&Inst::Extend {
28392822
rd,
28402823
rn,

cranelift/codegen/src/isa/aarch64/lower.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,24 @@ pub(crate) fn lower_fcmp_or_ffcmp_to_flags<C: LowerCtx<I = Inst>>(ctx: &mut C, i
10141014
}
10151015
}
10161016

1017+
/// Convert a 0 / 1 result, such as from a conditional-set instruction, into a 0
1018+
/// / -1 (all-ones) result as expected for bool operations.
1019+
pub(crate) fn normalize_bool_result<C: LowerCtx<I = Inst>>(
1020+
ctx: &mut C,
1021+
insn: IRInst,
1022+
rd: Writable<Reg>,
1023+
) {
1024+
// A boolean is 0 / -1; if output width is > 1, negate.
1025+
if ty_bits(ctx.output_ty(insn, 0)) > 1 {
1026+
ctx.emit(Inst::AluRRR {
1027+
alu_op: ALUOp::Sub64,
1028+
rd,
1029+
rn: zero_reg(),
1030+
rm: rd.to_reg(),
1031+
});
1032+
}
1033+
}
1034+
10171035
//=============================================================================
10181036
// Lowering-backend trait implementation.
10191037

cranelift/codegen/src/isa/aarch64/lower_inst.rs

Lines changed: 90 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1208,6 +1208,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
12081208
lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed);
12091209
let rd = get_output_reg(ctx, outputs[0]);
12101210
ctx.emit(Inst::CSet { rd, cond });
1211+
normalize_bool_result(ctx, insn, rd);
12111212
}
12121213

12131214
Opcode::Trueff => {
@@ -1217,6 +1218,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
12171218
lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn);
12181219
let rd = get_output_reg(ctx, outputs[0]);
12191220
ctx.emit(Inst::CSet { rd, cond });
1221+
normalize_bool_result(ctx, insn, rd);
12201222
}
12211223

12221224
Opcode::IsNull | Opcode::IsInvalid => {
@@ -1240,6 +1242,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
12401242
let const_value = ResultRSEImm12::Imm12(Imm12::maybe_from_u64(const_value).unwrap());
12411243
ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, const_value));
12421244
ctx.emit(Inst::CSet { rd, cond: Cond::Eq });
1245+
normalize_bool_result(ctx, insn, rd);
12431246
}
12441247

12451248
Opcode::Copy => {
@@ -1249,49 +1252,95 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
12491252
ctx.emit(Inst::gen_move(rd, rn, ty));
12501253
}
12511254

1252-
Opcode::Bint | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => {
1253-
// If this is a Bint from a Trueif/Trueff/IsNull/IsInvalid, then the result is already
1254-
// 64-bit-zero-extended, even if the CLIF type doesn't say so, because it was produced
1255-
// by a CSet. In this case, we do not need to do any zero-extension.
1256-
let input_info = ctx.get_input(insn, 0);
1257-
let src_op = input_info
1258-
.inst
1259-
.map(|(src_inst, _)| ctx.data(src_inst).opcode());
1260-
let narrow_mode = match (src_op, op) {
1261-
(Some(Opcode::Trueif), Opcode::Bint)
1262-
| (Some(Opcode::Trueff), Opcode::Bint)
1263-
| (Some(Opcode::IsNull), Opcode::Bint)
1264-
| (Some(Opcode::IsInvalid), Opcode::Bint) => NarrowValueMode::None,
1265-
_ => NarrowValueMode::ZeroExtend64,
1266-
};
1267-
1268-
// All of these ops are simply a move from a zero-extended source.
1269-
// Here is why this works, in each case:
1270-
//
1271-
// - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we
1272-
// merely need to zero-extend here.
1273-
//
1274-
// - Breduce, Bextend: changing width of a boolean. We represent a
1275-
// bool as a 0 or 1, so again, this is a zero-extend / no-op.
1276-
//
1277-
// - Ireduce: changing width of an integer. Smaller ints are stored
1278-
// with undefined high-order bits, so we can simply do a copy.
1279-
1280-
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1255+
Opcode::Breduce | Opcode::Ireduce => {
1256+
// Smaller integers/booleans are stored with high-order bits
1257+
// undefined, so we can simply do a copy.
1258+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
12811259
let rd = get_output_reg(ctx, outputs[0]);
12821260
let ty = ctx.input_ty(insn, 0);
12831261
ctx.emit(Inst::gen_move(rd, rn, ty));
12841262
}
12851263

1286-
Opcode::Bmask => {
1287-
// Bool is {0, 1}, so we can subtract from 0 to get all-1s.
1264+
Opcode::Bextend | Opcode::Bmask => {
1265+
// Bextend and Bmask both simply sign-extend. This works for:
1266+
// - Bextend, because booleans are stored as 0 / -1, so we
1267+
// sign-extend the -1 to a -1 in the wider width.
1268+
// - Bmask, because the resulting integer mask value must be
1269+
// all-ones (-1) if the argument is true.
1270+
//
1271+
// For a sign-extension from a 1-bit value (Case 1 below), we need
1272+
// to do things a bit specially, because the ISA does not have a
1273+
// 1-to-N-bit sign extension instruction. For 8-bit or wider
1274+
// sources (Case 2 below), we do a sign extension normally.
1275+
1276+
let from_ty = ctx.input_ty(insn, 0);
1277+
let to_ty = ctx.output_ty(insn, 0);
1278+
let from_bits = ty_bits(from_ty);
1279+
let to_bits = ty_bits(to_ty);
1280+
1281+
assert!(
1282+
from_bits <= 64 && to_bits <= 64,
1283+
"Vector Bextend not supported yet"
1284+
);
1285+
assert!(from_bits <= to_bits);
1286+
1287+
if from_bits == to_bits {
1288+
// Nothing.
1289+
} else if from_bits == 1 {
1290+
assert!(to_bits >= 8);
1291+
// Case 1: 1-bit to N-bit extension: AND the LSB of source into
1292+
// dest, generating a value of 0 or 1, then negate to get
1293+
// 0x000... or 0xfff...
1294+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
1295+
let rd = get_output_reg(ctx, outputs[0]);
1296+
// AND Rdest, Rsource, #1
1297+
ctx.emit(Inst::AluRRImmLogic {
1298+
alu_op: ALUOp::And64,
1299+
rd,
1300+
rn,
1301+
imml: ImmLogic::maybe_from_u64(1, I64).unwrap(),
1302+
});
1303+
// SUB Rdest, XZR, Rdest (i.e., NEG Rdest)
1304+
ctx.emit(Inst::AluRRR {
1305+
alu_op: ALUOp::Sub64,
1306+
rd,
1307+
rn: zero_reg(),
1308+
rm: rd.to_reg(),
1309+
});
1310+
} else {
1311+
// Case 2: 8-or-more-bit to N-bit extension: just sign-extend. A
1312+
// `true` (all ones, or `-1`) will be extended to -1 with the
1313+
// larger width.
1314+
assert!(from_bits >= 8);
1315+
let narrow_mode = if to_bits == 64 {
1316+
NarrowValueMode::SignExtend64
1317+
} else {
1318+
assert!(to_bits <= 32);
1319+
NarrowValueMode::SignExtend32
1320+
};
1321+
let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
1322+
let rd = get_output_reg(ctx, outputs[0]);
1323+
ctx.emit(Inst::gen_move(rd, rn, to_ty));
1324+
}
1325+
}
1326+
1327+
Opcode::Bint => {
1328+
// Booleans are stored as all-zeroes (0) or all-ones (-1). We AND
1329+
// out the LSB to give a 0 / 1-valued integer result.
1330+
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
12881331
let rd = get_output_reg(ctx, outputs[0]);
1289-
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
1290-
ctx.emit(Inst::AluRRR {
1291-
alu_op: ALUOp::Sub64,
1332+
let output_bits = ty_bits(ctx.output_ty(insn, 0));
1333+
1334+
let (imm_ty, alu_op) = if output_bits > 32 {
1335+
(I64, ALUOp::And64)
1336+
} else {
1337+
(I32, ALUOp::And32)
1338+
};
1339+
ctx.emit(Inst::AluRRImmLogic {
1340+
alu_op,
12921341
rd,
1293-
rn: zero_reg(),
1294-
rm,
1342+
rn,
1343+
imml: ImmLogic::maybe_from_u64(1, imm_ty).unwrap(),
12951344
});
12961345
}
12971346

@@ -1369,7 +1418,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
13691418
let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64);
13701419
let rm = put_input_in_rse_imm12(ctx, inputs[1], narrow_mode);
13711420
ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm));
1372-
ctx.emit(Inst::CondSet { cond, rd });
1421+
ctx.emit(Inst::CSet { cond, rd });
1422+
normalize_bool_result(ctx, insn, rd);
13731423
} else {
13741424
let rm = put_input_in_reg(ctx, inputs[1], narrow_mode);
13751425
lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
@@ -1394,7 +1444,8 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
13941444
}
13951445
_ => panic!("Bad float size"),
13961446
}
1397-
ctx.emit(Inst::CondSet { cond, rd });
1447+
ctx.emit(Inst::CSet { cond, rd });
1448+
normalize_bool_result(ctx, insn, rd);
13981449
} else {
13991450
lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
14001451
}
@@ -1659,6 +1710,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
16591710
});
16601711

16611712
ctx.emit(Inst::CSet { rd, cond: Cond::Ne });
1713+
normalize_bool_result(ctx, insn, rd);
16621714
}
16631715

16641716
Opcode::Shuffle

0 commit comments

Comments
 (0)