[BPF] Add load-acquire and store-release instructions under -mcpu=v4 #108636
yonghong-song merged 1 commit into llvm:main from …
Conversation
@llvm/pr-subscribers-llvm-binary-utilities @llvm/pr-subscribers-clang

Author: Peilin Ye (peilin-ye)

Changes

As discussed in [1], introduce BPF instructions with load-acquire and
store-release semantics under -mcpu=v5.

A "load_acquire" is a BPF_LDX instruction with a new mode modifier,
BPF_MEMACQ. Similarly, a "store_release" is a BPF_STX instruction with
the new mode modifier BPF_MEMREL. BPF_MEMACQ and BPF_MEMREL share the
same numeric value, 0x7 (or 0b111).

For example:

  long foo(long *ptr) {
      return __atomic_load_n(ptr, __ATOMIC_ACQUIRE);
  }

foo() can be compiled to:

  f9 10 00 00 00 00 00 00  r0 = load_acquire((u64 *)(r1 + 0x0))

Opcode 0xf9, or 0b11111001, can be decoded as:

  0b 111 11 001
  mode 0b111 (BPF_MEMACQ), size 0b11 (BPF_DW), class 0b001 (BPF_LDX)

Similarly:

  void bar(short *ptr, short val) {
      __atomic_store_n(ptr, val, __ATOMIC_RELEASE);
  }

bar() can be compiled to:

  eb 21 00 00 00 00 00 00  store_release((u16 *)(r1 + 0x0), w2)

Opcode 0xeb, or 0b11101011, can be decoded as:

  0b 111 01 011
  mode 0b111 (BPF_MEMREL), size 0b01 (BPF_H), class 0b011 (BPF_STX)

Inline assembly is also supported. For example:

  asm volatile("%0 = load_acquire((u64 *)(%1 + 0x0))" : ...

Let 'llvm-objdump -d' use -mcpu=v5 by default, just like commit …

Add two macros, __BPF_FEATURE_LOAD_ACQUIRE and
__BPF_FEATURE_STORE_RELEASE, for detecting these new features.

Refactored BPFSubtarget::initSubtargetFeatures(), as well as the LOAD,
LOAD32, STORE and STORE32 TableGen classes.

[1] https://lore.kernel.org/all/20240729183246.4110549-1-yepeilin@google.com/

Patch is 21.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108636.diff

14 Files Affected:
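For illustration, a complete version of the truncated inline-asm snippet above might look like the following; the "=r"/"r" constraints and the wrapper function are assumptions, not taken from the patch:

  long load_acquire_u64(long *ptr) {
      long ret;
      /* Assumed constraints: "=r" for the output register holding the
       * loaded value, "r" for the pointer; "memory" clobber keeps the
       * compiler from reordering other accesses around the acquire. */
      asm volatile("%0 = load_acquire((u64 *)(%1 + 0x0))"
                   : "=r"(ret)
                   : "r"(ptr)
                   : "memory");
      return ret;
  }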
diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
index a94ceee5a6a5e7..7a381e4853093a 100644
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -67,10 +67,15 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__BPF_FEATURE_GOTOL");
Builder.defineMacro("__BPF_FEATURE_ST");
}
+
+ if (CpuVerNum >= 5) {
+ Builder.defineMacro("__BPF_FEATURE_LOAD_ACQUIRE");
+ Builder.defineMacro("__BPF_FEATURE_STORE_RELEASE");
+ }
}
-static constexpr llvm::StringLiteral ValidCPUNames[] = {"generic", "v1", "v2",
- "v3", "v4", "probe"};
+static constexpr llvm::StringLiteral ValidCPUNames[] = {
+ "generic", "v1", "v2", "v3", "v4", "v5", "probe"};
bool BPFTargetInfo::isValidCPUName(StringRef Name) const {
return llvm::is_contained(ValidCPUNames, Name);
diff --git a/clang/lib/Basic/Targets/BPF.h b/clang/lib/Basic/Targets/BPF.h
index d19b37dd4df7a7..3ca9c07f955f9b 100644
--- a/clang/lib/Basic/Targets/BPF.h
+++ b/clang/lib/Basic/Targets/BPF.h
@@ -106,7 +106,7 @@ class LLVM_LIBRARY_VISIBILITY BPFTargetInfo : public TargetInfo {
void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
bool setCPU(const std::string &Name) override {
- if (Name == "v3" || Name == "v4") {
+ if (Name == "v3" || Name == "v4" || Name == "v5") {
HasAlu32 = true;
}
diff --git a/clang/test/Misc/target-invalid-cpu-note/bpf.c b/clang/test/Misc/target-invalid-cpu-note/bpf.c
index fe925f86cdd137..7f90e64ab16f16 100644
--- a/clang/test/Misc/target-invalid-cpu-note/bpf.c
+++ b/clang/test/Misc/target-invalid-cpu-note/bpf.c
@@ -10,6 +10,7 @@
// CHECK-SAME: {{^}}, v2
// CHECK-SAME: {{^}}, v3
// CHECK-SAME: {{^}}, v4
+// CHECK-SAME: {{^}}, v5
// CHECK-SAME: {{^}}, probe
// CHECK-SAME: {{$}}
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index f79c233d93fe8e..408af398e4ef17 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -442,7 +442,7 @@ std::optional<StringRef> ELFObjectFileBase::tryGetCPUName() const {
case ELF::EM_PPC64:
return StringRef("future");
case ELF::EM_BPF:
- return StringRef("v4");
+ return StringRef("v5");
default:
return std::nullopt;
}
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 9672ed009e9be1..4721097b765a08 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -237,6 +237,7 @@ struct BPFOperand : public MCParsedAsmOperand {
.Case("exit", true)
.Case("lock", true)
.Case("ld_pseudo", true)
+ .Case("store_release", true)
.Default(false);
}
@@ -273,6 +274,7 @@ struct BPFOperand : public MCParsedAsmOperand {
.Case("cmpxchg_64", true)
.Case("cmpxchg32_32", true)
.Case("addr_space_cast", true)
+ .Case("load_acquire", true)
.Default(false);
}
};
diff --git a/llvm/lib/Target/BPF/BPF.td b/llvm/lib/Target/BPF/BPF.td
index dff76ca07af511..dd2d4989561bc3 100644
--- a/llvm/lib/Target/BPF/BPF.td
+++ b/llvm/lib/Target/BPF/BPF.td
@@ -32,6 +32,7 @@ def : Proc<"v1", []>;
def : Proc<"v2", []>;
def : Proc<"v3", [ALU32]>;
def : Proc<"v4", [ALU32]>;
+def : Proc<"v5", [ALU32]>;
def : Proc<"probe", []>;
def BPFInstPrinter : AsmWriter {
diff --git a/llvm/lib/Target/BPF/BPFInstrFormats.td b/llvm/lib/Target/BPF/BPFInstrFormats.td
index feffdbc69465ea..cb68b0d1250e6e 100644
--- a/llvm/lib/Target/BPF/BPFInstrFormats.td
+++ b/llvm/lib/Target/BPF/BPFInstrFormats.td
@@ -94,6 +94,8 @@ def BPF_IND : BPFModeModifer<0x2>;
def BPF_MEM : BPFModeModifer<0x3>;
def BPF_MEMSX : BPFModeModifer<0x4>;
def BPF_ATOMIC : BPFModeModifer<0x6>;
+def BPF_MEMACQ : BPFModeModifer<0x7>;
+def BPF_MEMREL : BPFModeModifer<0x7>;
class BPFAtomicFlag<bits<4> val> {
bits<4> Value = val;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index f7e17901c7ed5e..79de3e77b2b123 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -60,6 +60,8 @@ def BPFHasSdivSmod : Predicate<"Subtarget->hasSdivSmod()">;
def BPFNoMovsx : Predicate<"!Subtarget->hasMovsx()">;
def BPFNoBswap : Predicate<"!Subtarget->hasBswap()">;
def BPFHasStoreImm : Predicate<"Subtarget->hasStoreImm()">;
+def BPFHasLoadAcquire : Predicate<"Subtarget->hasLoadAcquire()">;
+def BPFHasStoreRelease : Predicate<"Subtarget->hasStoreRelease()">;
class ImmediateAsmOperand<string name> : AsmOperandClass {
let Name = name;
@@ -497,12 +499,11 @@ def LD_pseudo
}
// STORE instructions
-class STORE<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
- : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+class STORE<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string AsmString, list<dag> Pattern>
+ : TYPE_LD_ST<ModOp.Value, SizeOp.Value,
(outs),
(ins GPR:$src, MEMri:$addr),
- "*("#OpcodeStr#" *)($addr) = $src",
- Pattern> {
+ AsmString, Pattern> {
bits<4> src;
bits<20> addr;
@@ -513,7 +514,11 @@ class STORE<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
}
class STOREi64<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
- : STORE<Opc, OpcodeStr, [(OpNode GPR:$src, ADDRri:$addr)]>;
+ : STORE<Opc, BPF_MEM, "*("#OpcodeStr#" *)($addr) = $src", [(OpNode GPR:$src, ADDRri:$addr)]>;
+
+class STORE_RELEASEi64<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
+ : STORE<Opc, BPF_MEMREL, "store_release(("#OpcodeStr#" *)($addr), $src)",
+ [(OpNode GPR:$src, ADDRri:$addr)]>;
let Predicates = [BPFNoALU32] in {
def STW : STOREi64<BPF_W, "u32", truncstorei32>;
@@ -522,6 +527,16 @@ let Predicates = [BPFNoALU32] in {
}
def STD : STOREi64<BPF_DW, "u64", store>;
+class releasing_store<PatFrag base>
+ : PatFrag<(ops node:$val, node:$ptr), (base node:$val, node:$ptr)> {
+ let IsAtomic = 1;
+ let IsAtomicOrderingRelease = 1;
+}
+
+let Predicates = [BPFHasStoreRelease] in {
+ def STDREL : STORE_RELEASEi64<BPF_DW, "u64", releasing_store<atomic_store_64>>;
+}
+
class STORE_imm<BPFWidthModifer SizeOp,
string OpcodeStr, dag Pattern>
: TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
@@ -567,12 +582,11 @@ let Predicates = [BPFHasALU32, BPFHasStoreImm] in {
}
// LOAD instructions
-class LOAD<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, list<dag> Pattern>
+class LOAD<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string AsmString, list<dag> Pattern>
: TYPE_LD_ST<ModOp.Value, SizeOp.Value,
(outs GPR:$dst),
(ins MEMri:$addr),
- "$dst = *("#OpcodeStr#" *)($addr)",
- Pattern> {
+ AsmString, Pattern> {
bits<4> dst;
bits<20> addr;
@@ -583,7 +597,13 @@ class LOAD<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, list<
}
class LOADi64<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, PatFrag OpNode>
- : LOAD<SizeOp, ModOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
+ : LOAD<SizeOp, ModOp, "$dst = *("#OpcodeStr#" *)($addr)",
+ [(set i64:$dst, (OpNode ADDRri:$addr))]>;
+
+class LOAD_ACQUIREi64<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr,
+ PatFrag OpNode>
+ : LOAD<SizeOp, ModOp, "$dst = load_acquire(("#OpcodeStr#" *)($addr))",
+ [(set i64:$dst, (OpNode ADDRri:$addr))]>;
let isCodeGenOnly = 1 in {
class CORE_LD<RegisterClass RegClass, string Sz>
@@ -622,6 +642,16 @@ let Predicates = [BPFHasLdsx] in {
def LDD : LOADi64<BPF_DW, BPF_MEM, "u64", load>;
+class acquiring_load<PatFrags base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr)> {
+ let IsAtomic = 1;
+ let IsAtomicOrderingAcquire = 1;
+}
+
+let Predicates = [BPFHasLoadAcquire] in {
+ def LDDACQ : LOAD_ACQUIREi64<BPF_DW, BPF_MEMACQ, "u64", acquiring_load<atomic_load_64>>;
+}
+
class BRANCH<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
: TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
(outs),
@@ -1069,12 +1099,11 @@ def : Pat<(i32 (trunc GPR:$src)),
def : Pat<(i64 (anyext GPR32:$src)),
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
-class STORE32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
- : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+class STORE32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string AsmString, list<dag> Pattern>
+ : TYPE_LD_ST<ModOp.Value, SizeOp.Value,
(outs),
(ins GPR32:$src, MEMri:$addr),
- "*("#OpcodeStr#" *)($addr) = $src",
- Pattern> {
+ AsmString, Pattern> {
bits<4> src;
bits<20> addr;
@@ -1085,20 +1114,30 @@ class STORE32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
}
class STOREi32<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
- : STORE32<Opc, OpcodeStr, [(OpNode GPR32:$src, ADDRri:$addr)]>;
+ : STORE32<Opc, BPF_MEM, "*("#OpcodeStr#" *)($addr) = $src",
+ [(OpNode GPR32:$src, ADDRri:$addr)]>;
+
+class STORE_RELEASEi32<BPFWidthModifer Opc, string OpcodeStr, PatFrag OpNode>
+ : STORE32<Opc, BPF_MEMREL, "store_release(("#OpcodeStr#" *)($addr), $src)",
+ [(OpNode GPR32:$src, ADDRri:$addr)]>;
let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
def STW32 : STOREi32<BPF_W, "u32", store>;
def STH32 : STOREi32<BPF_H, "u16", truncstorei16>;
def STB32 : STOREi32<BPF_B, "u8", truncstorei8>;
+
+ let Predicates = [BPFHasStoreRelease] in {
+ def STWREL32 : STORE_RELEASEi32<BPF_W, "u32", releasing_store<atomic_store_32>>;
+ def STHREL32 : STORE_RELEASEi32<BPF_H, "u16", releasing_store<atomic_store_16>>;
+ def STBREL32 : STORE_RELEASEi32<BPF_B, "u8", releasing_store<atomic_store_8>>;
+ }
}
-class LOAD32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, list<dag> Pattern>
+class LOAD32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string AsmString, list<dag> Pattern>
: TYPE_LD_ST<ModOp.Value, SizeOp.Value,
(outs GPR32:$dst),
(ins MEMri:$addr),
- "$dst = *("#OpcodeStr#" *)($addr)",
- Pattern> {
+ AsmString, Pattern> {
bits<4> dst;
bits<20> addr;
@@ -1109,12 +1148,24 @@ class LOAD32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, lis
}
class LOADi32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, PatFrag OpNode>
- : LOAD32<SizeOp, ModOp, OpcodeStr, [(set i32:$dst, (OpNode ADDRri:$addr))]>;
+ : LOAD32<SizeOp, ModOp, "$dst = *("#OpcodeStr#" *)($addr)",
+ [(set i32:$dst, (OpNode ADDRri:$addr))]>;
+
+class LOAD_ACQUIREi32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr,
+ PatFrag OpNode>
+ : LOAD32<SizeOp, ModOp, "$dst = load_acquire(("#OpcodeStr#" *)($addr))",
+ [(set i32:$dst, (OpNode ADDRri:$addr))]>;
let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
def LDW32 : LOADi32<BPF_W, BPF_MEM, "u32", load>;
def LDH32 : LOADi32<BPF_H, BPF_MEM, "u16", zextloadi16>;
def LDB32 : LOADi32<BPF_B, BPF_MEM, "u8", zextloadi8>;
+
+ let Predicates = [BPFHasLoadAcquire] in {
+ def LDWACQ32 : LOAD_ACQUIREi32<BPF_W, BPF_MEMACQ, "u32", acquiring_load<atomic_load_32>>;
+ def LDHACQ32 : LOAD_ACQUIREi32<BPF_H, BPF_MEMACQ, "u16", acquiring_load<atomic_load_az_16>>;
+ def LDBACQ32 : LOAD_ACQUIREi32<BPF_B, BPF_MEMACQ, "u8", acquiring_load<atomic_load_az_8>>;
+ }
}
let Predicates = [BPFHasALU32] in {
diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
index 39390e8c38f8c1..0763550cb03419 100644
--- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
+++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -100,21 +100,25 @@ static bool isST(unsigned Opcode) {
}
static bool isSTX32(unsigned Opcode) {
- return Opcode == BPF::STB32 || Opcode == BPF::STH32 || Opcode == BPF::STW32;
+ return Opcode == BPF::STB32 || Opcode == BPF::STH32 || Opcode == BPF::STW32 ||
+ Opcode == BPF::STBREL32 || Opcode == BPF::STHREL32 ||
+ Opcode == BPF::STWREL32;
}
static bool isSTX64(unsigned Opcode) {
return Opcode == BPF::STB || Opcode == BPF::STH || Opcode == BPF::STW ||
- Opcode == BPF::STD;
+ Opcode == BPF::STD || Opcode == BPF::STDREL;
}
static bool isLDX32(unsigned Opcode) {
- return Opcode == BPF::LDB32 || Opcode == BPF::LDH32 || Opcode == BPF::LDW32;
+ return Opcode == BPF::LDB32 || Opcode == BPF::LDH32 || Opcode == BPF::LDW32 ||
+ Opcode == BPF::LDBACQ32 || Opcode == BPF::LDHACQ32 ||
+ Opcode == BPF::LDWACQ32;
}
static bool isLDX64(unsigned Opcode) {
return Opcode == BPF::LDB || Opcode == BPF::LDH || Opcode == BPF::LDW ||
- Opcode == BPF::LDD;
+ Opcode == BPF::LDD || Opcode == BPF::LDDACQ;
}
static bool isLDSX(unsigned Opcode) {
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp
index 305e9a2bf2cda3..a52c39efccedcb 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -40,6 +40,12 @@ static cl::opt<bool> Disable_gotol("disable-gotol", cl::Hidden, cl::init(false),
static cl::opt<bool>
Disable_StoreImm("disable-storeimm", cl::Hidden, cl::init(false),
cl::desc("Disable BPF_ST (immediate store) insn"));
+static cl::opt<bool>
+ Disable_load_acquire("disable-load-acquire", cl::Hidden, cl::init(false),
+ cl::desc("Disable load-acquire insns"));
+static cl::opt<bool>
+ Disable_store_release("disable-store-release", cl::Hidden, cl::init(false),
+ cl::desc("Disable store-release insns"));
void BPFSubtarget::anchor() {}
@@ -62,6 +68,8 @@ void BPFSubtarget::initializeEnvironment() {
HasSdivSmod = false;
HasGotol = false;
HasStoreImm = false;
+ HasLoadAcquire = false;
+ HasStoreRelease = false;
}
void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
@@ -69,29 +77,30 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
CPU = "v3";
if (CPU == "probe")
CPU = sys::detail::getHostCPUNameForBPF();
- if (CPU == "generic" || CPU == "v1")
- return;
- if (CPU == "v2") {
- HasJmpExt = true;
+ if (CPU.empty() || CPU == "generic" || CPU == "v1")
return;
- }
- if (CPU == "v3") {
+
+ int CpuVerNum = CPU.back() - '0';
+ if (CpuVerNum >= 2)
HasJmpExt = true;
+
+ if (CpuVerNum >= 3) {
HasJmp32 = true;
HasAlu32 = true;
- return;
}
- if (CPU == "v4") {
- HasJmpExt = true;
- HasJmp32 = true;
- HasAlu32 = true;
+
+ if (CpuVerNum >= 4) {
HasLdsx = !Disable_ldsx;
HasMovsx = !Disable_movsx;
HasBswap = !Disable_bswap;
HasSdivSmod = !Disable_sdiv_smod;
HasGotol = !Disable_gotol;
HasStoreImm = !Disable_StoreImm;
- return;
+ }
+
+ if (CpuVerNum >= 5) {
+ HasLoadAcquire = !Disable_load_acquire;
+ HasStoreRelease = !Disable_store_release;
}
}
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.h b/llvm/lib/Target/BPF/BPFSubtarget.h
index 33747546eadc3b..aa7995c3af5ecf 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.h
+++ b/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -66,6 +66,9 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
// whether cpu v4 insns are enabled.
bool HasLdsx, HasMovsx, HasBswap, HasSdivSmod, HasGotol, HasStoreImm;
+ // whether cpu v5 insns are enabled.
+ bool HasLoadAcquire, HasStoreRelease;
+
std::unique_ptr<CallLowering> CallLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
@@ -92,6 +95,8 @@ class BPFSubtarget : public BPFGenSubtargetInfo {
bool hasSdivSmod() const { return HasSdivSmod; }
bool hasGotol() const { return HasGotol; }
bool hasStoreImm() const { return HasStoreImm; }
+ bool hasLoadAcquire() const { return HasLoadAcquire; }
+ bool hasStoreRelease() const { return HasStoreRelease; }
bool isLittleEndian() const { return IsLittleEndian; }
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 536bee5393843a..d1b9769d3bed96 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -58,7 +58,9 @@ class BPFDisassembler : public MCDisassembler {
BPF_IND = 0x2,
BPF_MEM = 0x3,
BPF_MEMSX = 0x4,
- BPF_ATOMIC = 0x6
+ BPF_ATOMIC = 0x6,
+ BPF_MEMACQ = 0x7,
+ BPF_MEMREL = 0x7
};
BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
@@ -177,7 +179,8 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
uint8_t InstMode = getInstMode(Insn);
if ((InstClass == BPF_LDX || InstClass == BPF_STX) &&
getInstSize(Insn) != BPF_DW &&
- (InstMode == BPF_MEM || InstMode == BPF_ATOMIC) &&
+ (InstMode == BPF_MEM || InstMode == BPF_ATOMIC ||
+ InstMode == BPF_MEMACQ /* or BPF_MEMREL */) &&
STI.hasFeature(BPF::ALU32))
Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address,
this, STI);
diff --git a/llvm/test/CodeGen/BPF/acquire-release.ll b/llvm/test/CodeGen/BPF/acquire-release.ll
new file mode 100644
index 00000000000000..1c7db417b6c43f
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/acquire-release.ll
@@ -0,0 +1,95 @@
+; RUN: llc < %s -march=bpfel -mcpu=v5 -verify-machineinstrs -show-mc-encoding | FileCheck %s
+;
+; Source:
+; char load_acquire_i8(char *p) {
+; return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+; }
+; short load_acquire_i16(short *p) {
+; return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+; }
+; int load_acquire_i32(int *p) {
+; return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+; }
+; long load_acquire_i64(long *p) {
+; return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+; }
+; void store_release_i8(char *p, char v) {
+; __atomic_store_n(p, v, __ATOMIC_RELEASE);
+; }
+; void store_release_i16(short *p, short v) {
+; __atomic_store_n(p, v, __ATOMIC_RELEASE);
+; }
+; void store_release_i32(int *p, int v) {
+; __atomic_store_n(p, v, __ATOMIC_RELEASE);
+; }
+; void store_release_i64(long *p, long v) {
+; __atomic_store_n(p, v, __ATOMIC_RELEASE);
+; }
+
+; CHECK-LABEL: load_acquire_i8
+; CHECK: w0 = load_acquire((u8 *)(r1 + 0)) # encoding: [0xf1,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+define dso_local i8 @load_acquire_i8(ptr nocapture noundef readonly %p) local_unnamed_addr {
+entry:
+ %0 = load atomic i8, ptr %p acquire, align 1
+ ret i8 %0
+}
+
+; CHECK-LABEL: load_acquire_i16
+; CHECK: w0 = load_acquire((u16 *)(r1 + 0)) # encoding: [0xe9,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+define dso_local i16 @load_acquire_i16(ptr nocapture noundef readonly %p) local_unnamed_addr {
+entry:
+ %0 = load atomic i16, ptr %p acquire, align 2
+ ret i16 %0
+}
+
+; CHECK-LABEL: load_acquire_i32
+; CHECK: w0 = load_acquire((u32 *)(r1 + 0)) # encoding: [0xe1,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+define dso_local i32 @load_acquire_i32(ptr nocapture noundef readonly %p) local_unnamed_addr {
+entry:
+ %0 = load atomic i32, ptr %p acquire, align 4
+ ret i32 %0
+}
+
+; CHECK-LABEL: load_acquire_i64
+; CHECK: r0 = load_acquire((u64 *)(r1 + 0)) # encoding: [0xf9,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+define dso_local i64 @load_acquire_i64(ptr nocapture noundef readonly %p) local_unnamed_addr {
+entry:
+ %0 = load atomic i64, ptr %p acquire, align 8
+ ret i64 %0
+}
+
+; CHECK-LABEL: store_release_i8
+; CHECK: store_release((u8 *)(r1 + 0), w2) # encoding: [0xf3,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+define void @store_release_i8(ptr nocapture noundef writeonly %p,
+ i8 noundef signext %v) local_unnamed_addr {
+entry:
+ store atomic i8 %v, ptr %p release, align 1
+ ret void
+}
+
+; CHECK-LABEL: store_release_i16
+; CHECK: store_release((u16 *)(r1 + 0), w2) # encoding: [0xeb,0x21,0x00,0x00,0x00,0x00,0x00,0x00]
+define void @store_release_i16(ptr nocapture noundef writeonly %p,
+ i16 noundef signext %v) local_unnamed_addr ...
[truncated]
I think this looks fine, but the syntax seems like it adds to the zoo we already have.
E.g. there are already instructions like lock *(u32 *)(r1 + 0x0) &= w2, and we treat these as having monotonic / memory_order_relaxed semantics (see #107343).
It seems, to me at least, that it would be more logical to build on top of the already existing syntax, e.g.:
lock *(u64 *)(r1 + 0x0) = r2 release
lock r2 = *(u64 *)(r1 + 0x0) acquire
Also, a question regarding the 3 commits in one pull request.
As far as I understand the current policy, the idea is to have a separate pull request for each commit (and fork branches from one another to build a stack); otherwise the commits are squashed. What's the idea with the current pull request?
Hi @eddyz87, thanks for the review and context!

Appending acquire / release to the existing syntax does read more consistently. However, I didn't want to use the lock prefix for these.

Got it, I'll split this into separate PRs later.
Well, we already confuse people in a way, since the existing lock instructions actually have relaxed semantics.
(Not that I like this policy, but that's the way the LLVM team decided to use GitHub 🤷♂️)
tbh I don't like such syntax. It's harder to read compared to what the patch does, and the "lock" part doesn't fit here either: "lock" is x86 specific.
(Pushed v2 to resolve the easier issues first :-)

I couldn't find a way to generate a better error message for unsupported memory orderings (e.g. __ATOMIC_SEQ_CST).
There is probably some tablegen incantation to invoke custom C++ for some matched pattern, but I haven't found it. Also, I have not found any target info hooks for clang to report unsupported orderings. One option is to extend the existing mechanics:

diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index ff23d3b055d0..9e54c7f3de65 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -91,6 +91,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
}
for (auto VT : { MVT::i32, MVT::i64 }) {
@@ -291,6 +292,14 @@ void BPFTargetLowering::ReplaceNodeResults(
else
Msg = "unsupported atomic operation, please use 64 bit version";
break;
+ case ISD::ATOMIC_LOAD: {
+ auto *AtomicLoad = cast<AtomicSDNode>(N);
+ if (AtomicLoad->getMergedOrdering() != AtomicOrdering::AcquireRelease &&
+ AtomicLoad->getMergedOrdering() != AtomicOrdering::SequentiallyConsistent)
+ return;
+ Msg = "a useful message about unsupported ordering here";
+ break;
+ }
}
SDLoc DL(N);

When compiled with …
Thanks! I didn't know about ReplaceNodeResults().

Can we simply exit(1) once the error has been emitted? That's what LLVMContext::diagnose() itself ends with, when no diagnostic handler intercepts the diagnostic:

  if (DI.getSeverity() == DS_Error)
    exit(1);
}
I tried the following, based on your diff:

--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -93,6 +93,9 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
}
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64})
+ setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
+
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i32 && !STI.getHasAlu32())
continue;
@@ -291,6 +294,8 @@ void BPFTargetLowering::ReplaceNodeResults(
else
Msg = "unsupported atomic operation, please use 64 bit version";
break;
+ case ISD::ATOMIC_LOAD:
+ return;
}
SDLoc DL(N);
@@ -316,6 +321,8 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSDIVSREM(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::ATOMIC_LOAD:
+ return LowerATOMIC_LOAD(Op, DAG);
}
}
@@ -703,6 +710,22 @@ SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(BPFISD::SELECT_CC, DL, VTs, Ops);
}
+SDValue BPFTargetLowering::LowerATOMIC_LOAD(SDValue Op,
+ SelectionDAG &DAG) const {
+ const char *Msg =
+ "sequentially consistent (seq_cst) atomic load is not supported";
+ SDNode *N = Op.getNode();
+ SDLoc DL(N);
+
+ if (cast<AtomicSDNode>(N)->getMergedOrdering() ==
+ AtomicOrdering::SequentiallyConsistent) {
+ fail(DL, DAG, Msg);
+ exit(1);
+ }
+
+ return Op;
+}
+
const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((BPFISD::NodeType)Opcode) {
case BPFISD::FIRST_NUMBER:

--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -77,7 +77,7 @@ private:
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
which seems to work nicely.
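(For reference, fail() in the diffs above is BPF's existing helper for emitting DiagnosticInfoUnsupported; roughly the following, as a sketch rather than a verbatim quote of the in-tree code:)

  // Emits a DiagnosticInfoUnsupported error attached to the current
  // function; llc prints it and exits, clang turns it into an error.
  static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
    MachineFunction &MF = DAG.getMachineFunction();
    DAG.getContext()->diagnose(
        DiagnosticInfoUnsupported(MF.getFunction(), Msg, DL.getDebugLoc()));
  }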
Sorry for the delayed reply.
Let's not hard-code the exit(1) there. I'd say let's go with what you suggest, but avoid the explicit exit() call.
No worries at all!
Got it, rebased and added a 4th commit (can't say I like the one-commit-per-PR policy either :-) to do this in v3. Please take another look.

Also, just a heads-up: I think this PR is blocked by an issue that I mentioned earlier on LKML (Cc: @4ast). After that is resolved, I think we should change this PR to use …
lgtm, but please note the CI failure.
Oops, I'll take a closer look later. Thanks!
This PR needs to land together with the kernel/verifier changes. It's necessary to get the llvm bits in landable shape first, and then land them together.
(Back on this; apologies for the delay.)

Pushed v4 to fix the CI failure. Assuming …
Hi @4ast,

Got it! I wanted to get your opinion on the linked issue though: this PR can use either mode 0x5 or mode 0x7, but right now both are already taken by kernel-internal pseudo instructions:

  /* unused opcode to mark special load instruction. Same as BPF_MSH */
  #define BPF_PROBE_MEM32 0xa0

  /* unused opcode to mark special atomic instruction */
  #define BPF_PROBE_ATOMIC 0xe0

Can I move them elsewhere to make room for this PR? Like, can I make them use the unused higher bits of the …?
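For context on why these collide: in the BPF opcode byte, the mode modifier occupies the top three bits, per the field-extraction macros in the kernel's uapi headers (quoted here for reference):

  #define BPF_CLASS(code) ((code) & 0x07)  /* instruction class, low 3 bits */
  #define BPF_SIZE(code)  ((code) & 0x18)  /* access size, bits 3-4 */
  #define BPF_MODE(code)  ((code) & 0xe0)  /* mode modifier, top 3 bits */

So BPF_PROBE_MEM32 (0xa0) sits in mode slot 0x5 (0xa0 >> 5) and BPF_PROBE_ATOMIC (0xe0) sits in mode slot 0x7 (0xe0 >> 5), the latter being exactly the value this PR assigns to BPF_MEMACQ/BPF_MEMREL.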
@peilin-ye Sorry for the late review. A few general observations below:

- From the source code, the operation should return a 'short' value. But since the result will be a 32-bit value, should we do sign extension here, i.e. w0 = load_acquire((s16 *)(r1 + 0x0))?
- Since load_acquire and store_release are essentially atomic operations (e.g. __atomic_load_n() and __atomic_store_n()), is it possible to add acquire/release support in BPF_ATOMIC? We might want to reserve mode modifier 7 (BPF_MEMACQ/BPF_MEMREL) for future use.
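To make the sign-extension question concrete, consider this illustrative sketch (hand-written, not compiler output from this patch):

  short load_acquire_i16(short *p) {
      /* The 16-bit result must be widened to fill a 32-bit register. */
      return __atomic_load_n(p, __ATOMIC_ACQUIRE);
  }

With zero-extending semantics (what the patch ends up doing, matching ARM64's LDAPRH and friends), the sign extension would have to be a separate instruction, e.g.:

  w0 = load_acquire((u16 *)(r1 + 0x0))
  w0 = (s16)w0

whereas a sign-extending (s16 *) form would fold both into a single load-acquire.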
Changes in v7:

o Introduce the following new flags for memory orders (…): include all orders defined by the C++ standard except memory_order_consume.
I remember I mentioned earlier that for __ATOMIC_RELAXED we should just use a plain load/store. But I have the following example: for fetch_and_*() operations, if the order is __ATOMIC_RELAXED, the generated code will differ from the other atomic flavors.
Sure Yonghong, I can include that in this PR.

Similarly, since we cannot have both zero- and sign-extending semantics in a single plain (BPF_MEM) load, an __ATOMIC_RELAXED load of a signed sub-word type needs a separate sign-extension instruction. For example:

  int foo(char *ptr) {
      return __atomic_load_n(ptr, __ATOMIC_RELAXED);
  }

This'll be compiled into:

  71 11 00 00 00 00 00 00  w1 = *(u8 *)(r1 + 0x0)
  bc 10 08 00 00 00 00 00  w0 = (s8)w1
  95 00 00 00 00 00 00 00  exit
Pushed v8 to make it generate plain (BPF_MEM) loads and stores for __ATOMIC_RELAXED.
Pushed v9:
Okay, LGTM. Thanks!
What is the status of the corresponding kernel changes?
Hi @4ast, I modified the verifier, and was able to load and dump a program (using …).
Pushed v10:
As discussed in [1], introduce BPF instructions with load-acquire and
store-release semantics under -mcpu=v4. Define 2 new flags:
BPF_LOAD_ACQ 0x100
BPF_STORE_REL 0x110
A "load-acquire" is a BPF_STX | BPF_ATOMIC instruction with the 'imm'
field set to BPF_LOAD_ACQ (0x100).
Similarly, a "store-release" is a BPF_STX | BPF_ATOMIC instruction with
the 'imm' field set to BPF_STORE_REL (0x110).
Unlike existing atomic read-modify-write operations that only support
BPF_W (32-bit) and BPF_DW (64-bit) size modifiers, load-acquires and
store-releases also support BPF_B (8-bit) and BPF_H (16-bit). An 8- or
16-bit load-acquire zero-extends the value before writing it to a 32-bit
register, just like ARM64 instruction LDAPRH and friends.
As an example (assuming little-endian):
long foo(long *ptr) {
return __atomic_load_n(ptr, __ATOMIC_ACQUIRE);
}
foo() can be compiled to:
db 10 00 00 00 01 00 00 r0 = load_acquire((u64 *)(r1 + 0x0))
95 00 00 00 00 00 00 00 exit
opcode (0xdb): BPF_ATOMIC | BPF_DW | BPF_STX
imm (0x00000100): BPF_LOAD_ACQ
Similarly:
void bar(short *ptr, short val) {
__atomic_store_n(ptr, val, __ATOMIC_RELEASE);
}
bar() can be compiled to:
cb 21 00 00 10 01 00 00 store_release((u16 *)(r1 + 0x0), w2)
95 00 00 00 00 00 00 00 exit
opcode (0xcb): BPF_ATOMIC | BPF_H | BPF_STX
imm (0x00000110): BPF_STORE_REL
Inline assembly is also supported.
Add a pre-defined macro, __BPF_FEATURE_LOAD_ACQ_STORE_REL, to let
developers detect this new feature. It can also be disabled using a new
llc option, -disable-load-acq-store-rel.
Using __ATOMIC_RELAXED for __atomic_store{,_n}() will generate a "plain"
store (BPF_MEM | BPF_STX) instruction:
void foo(short *ptr, short val) {
__atomic_store_n(ptr, val, __ATOMIC_RELAXED);
}
6b 21 00 00 00 00 00 00 *(u16 *)(r1 + 0x0) = w2
95 00 00 00 00 00 00 00 exit
Similarly, using __ATOMIC_RELAXED for __atomic_load{,_n}() will generate
a zero-extending, "plain" load (BPF_MEM | BPF_LDX) instruction:
int foo(char *ptr) {
return __atomic_load_n(ptr, __ATOMIC_RELAXED);
}
71 11 00 00 00 00 00 00 w1 = *(u8 *)(r1 + 0x0)
bc 10 08 00 00 00 00 00 w0 = (s8)w1
95 00 00 00 00 00 00 00 exit
Currently __ATOMIC_CONSUME is an alias for __ATOMIC_ACQUIRE. Using
__ATOMIC_SEQ_CST ("sequentially consistent") is not supported yet and
will cause an error:
$ clang --target=bpf -mcpu=v4 -c bar.c > /dev/null
bar.c:1:5: error: sequentially consistent (seq_cst) atomic load/store is not supported
1 | int foo(int *ptr) { return __atomic_load_n(ptr, __ATOMIC_SEQ_CST); }
| ^
...
Finally, rename those isST*() and isLD*() helper functions in
BPFMISimplifyPatchable.cpp based on what the instructions actually do,
rather than their instruction class.
[1] https://lore.kernel.org/all/20240729183246.4110549-1-yepeilin@google.com/
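As a usage sketch, BPF programs can gate their use of the new instructions on the feature macro above; the fallback branch here is purely illustrative and not from the patch:

  #ifdef __BPF_FEATURE_LOAD_ACQ_STORE_REL
  static inline long read_flag(long *p) {
      /* Compiles to a single load-acquire instruction under -mcpu=v4. */
      return __atomic_load_n(p, __ATOMIC_ACQUIRE);
  }
  #else
  static inline long read_flag(long *p) {
      /* Assumed fallback for older cpu versions: a plain volatile load,
       * which does NOT by itself provide acquire ordering. */
      return *(volatile long *)p;
  }
  #endif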
When landing, please simply land this PR as-is. Considering that I've made several references to this PR on the mailing list, and that the two other commits besides the "main" one are pretty small, I no longer think it's worth it to restructure this into stacked PRs. For future contributions, I'll make sure to use stacked PRs from the beginning and avoid squashing/force-pushing. Thanks!

Pushed v11:
Peilin Ye says:

====================
Introduce load-acquire and store-release BPF instructions

This patchset adds kernel support for BPF load-acquire and store-release
instructions (for background, please see [1]), including core/verifier
and arm64/x86-64 JIT compiler changes, as well as selftests. riscv64 is
also planned to be supported.

The corresponding LLVM changes can be found at:
llvm/llvm-project#108636

The first 3 patches from v4 have already been applied:

- [bpf-next,v4,01/10] bpf/verifier: Factor out atomic_ptr_type_ok()
  https://git.kernel.org/bpf/bpf-next/c/b2d9ef71d4c9
- [bpf-next,v4,02/10] bpf/verifier: Factor out check_atomic_rmw()
  https://git.kernel.org/bpf/bpf-next/c/d430c46c7580
- [bpf-next,v4,03/10] bpf/verifier: Factor out check_load_mem() and check_store_reg()
  https://git.kernel.org/bpf/bpf-next/c/d38ad248fb7a

Please refer to the LLVM PR and individual kernel patches for details.
Thanks!

v5: https://lore.kernel.org/all/cover.1741046028.git.yepeilin@google.com/
v5..v6 change:
o (Alexei) avoid using #ifndef in verifier.c

v4: https://lore.kernel.org/bpf/cover.1740978603.git.yepeilin@google.com/
v4..v5 notable changes:
o (kernel test robot) for 32-bit arches: make the verifier reject 64-bit
  load-acquires/store-releases, and fix build error in interpreter changes
  * tested ARCH=arc build following instructions from kernel test robot
o (Alexei) drop Documentation/ patch (v4 10/10) for now

v3: https://lore.kernel.org/bpf/cover.1740009184.git.yepeilin@google.com/
v3..v4 notable changes:
o (Alexei) add x86-64 JIT support (including arena)
o add Acked-by: tags from Xu

v2: https://lore.kernel.org/bpf/cover.1738888641.git.yepeilin@google.com/
v2..v3 notable changes:
o (Alexei) change encoding to BPF_LOAD_ACQ=0x100, BPF_STORE_REL=0x110
o add Acked-by: tags from Ilya and Eduard
o make new selftests depend on:
  * __clang_major__ >= 18, and
  * ENABLE_ATOMICS_TESTS is defined (currently this means -mcpu=v3 or v4), and
  * JIT supports load_acq/store_rel (currently only arm64)
o work around llvm-17 CI job failure by conditionally defining
  __arena_global variables as 64-bit if __clang_major__ < 18, to make
  sure .addr_space.1 has no holes
o add Google copyright notice in new files

v1: https://lore.kernel.org/all/cover.1737763916.git.yepeilin@google.com/
v1..v2 notable changes:
o (Eduard) for x86 and s390, make bpf_jit_supports_insn(...,
  /*in_arena=*/true) return false for load_acq/store_rel
o add Eduard's Acked-by: tag
o (Eduard) extract LDX and non-ATOMIC STX handling into helpers, see
  PATCH v2 3/9
o allow unpriv programs to store-release pointers to stack
o (Alexei) make it clearer in the interpreter code (PATCH v2 4/9) that
  only W and DW are supported for atomic RMW
o test misaligned load_acq/store_rel
o (Eduard) other selftests/ changes:
  * test load_acq/store_rel with !atomic_ptr_type_ok() pointers:
    - PTR_TO_CTX, for is_ctx_reg()
    - PTR_TO_PACKET, for is_pkt_reg()
    - PTR_TO_FLOW_KEYS, for is_flow_key_reg()
    - PTR_TO_SOCKET, for is_sk_reg()
  * drop atomics/ tests
  * delete unnecessary 'pid' checks from arena_atomics/ tests
  * avoid depending on __BPF_FEATURE_LOAD_ACQ_STORE_REL, use
    __imm_insn() and inline asm macros instead

RFC v1: https://lore.kernel.org/all/cover.1734742802.git.yepeilin@google.com
RFC v1..v1 notable changes:
o 1-2/8: minor verifier.c refactoring patches
o 3/8: core/verifier changes
  * (Eduard) handle load-acquire properly in backtrack_insn()
  * (Eduard) avoid skipping checks (e.g., bpf_jit_supports_insn()) for
    load-acquires
  * track the value stored by store-releases, just like how non-atomic
    STX instructions are handled
  * (Eduard) add missing link in commit message
  * (Eduard) always print 'r' for disasm.c changes
o 4/8: arm64/insn: avoid treating load_acq/store_rel as load_ex/store_ex
o 5/8: arm64/insn: add load_acq/store_rel
  * (Xu) include Should-Be-One (SBO) bits in "mask" and "value", to
    avoid setting fixed bits during runtime (JIT-compile time)
o 6/8: arm64 JIT compiler changes
  * (Xu) use emit_a64_add_i() for "pointer + offset" to optimize code
    emission
o 7/8: selftests
  * (Eduard) avoid adding new tests to the 'test_verifier' runner
  * add more tests, e.g., checking mark_precise logic
o 8/8: instruction-set.rst changes

[1] https://lore.kernel.org/all/20240729183246.4110549-1-yepeilin@google.com/

Thanks,
====================

Link: https://patch.msgid.link/cover.1741049567.git.yepeilin@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Hi @4ast, is there anything else needed before we can land this PR (I don't have write access)? Thanks!
@peilin-ye I can merge the patch. IIUC, the only new change since my last review is the following encoding: BPF_LOAD_ACQ=0x100, BPF_STORE_REL=0x110.
lgtm.