diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 1f4f8a6a17844..c8bffe741038a 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -545,6 +545,11 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + virtual void createDirectBranch(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) { + llvm_unreachable("not implemented"); + } + virtual MCPhysReg getX86R11() const { llvm_unreachable("not implemented"); } virtual unsigned getShortBranchOpcode(unsigned Opcode) const { @@ -2399,7 +2404,7 @@ class MCPlusBuilder { virtual InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr, - int CallSiteID, MCContext *Ctx) { + size_t CallSiteID, MCContext *Ctx) { llvm_unreachable("not implemented"); return InstructionListType(); } diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 14125ae196b80..75436b6625f93 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -48,14 +48,14 @@ static cl::opt NoLSEAtomics( namespace { -static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { +[[maybe_unused]] static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { Inst.setOpcode(AArch64::MRS); Inst.clear(); Inst.addOperand(MCOperand::createReg(RegName)); Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); } -static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { +[[maybe_unused]] static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { Inst.setOpcode(AArch64::MSR); Inst.clear(); Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); @@ -2413,6 +2413,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return isCompAndBranch(Inst); } + void createDirectBranch(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + Inst.setOpcode(AArch64::B); + Inst.clear(); + Inst.addOperand(MCOperand::createExpr(getTargetExprFor( + Inst, MCSymbolRefExpr::create(Target, *Ctx), *Ctx, 0))); + } + bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, @@ -2770,21 +2778,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } InstructionListType createInstrumentedIndCallHandlerExitBB() const override { - InstructionListType Insts(5); // Code sequence for instrumented indirect call handler: - // msr nzcv, x1 - // ldp x0, x1, [sp], #16 - // ldr x16, [sp], #16 - // ldp x0, x1, [sp], #16 - // br x16 - setSystemFlag(Insts[0], AArch64::X1); - createPopRegisters(Insts[1], AArch64::X0, AArch64::X1); - // Here we load address of the next function which should be called in the - // original binary to X16 register. Writing to X16 is permitted without - // needing to restore. - loadReg(Insts[2], AArch64::X16, AArch64::SP); - createPopRegisters(Insts[3], AArch64::X0, AArch64::X1); - createIndirectBranch(Insts[4], AArch64::X16, 0); + // ret + + InstructionListType Insts; + + Insts.emplace_back(); + createReturn(Insts.back()); + return Insts; } @@ -2837,14 +2838,28 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType createLoadImmediate(const MCPhysReg Dest, uint64_t Imm) const override { - InstructionListType Insts(4); - int Shift = 48; - for (int I = 0; I < 4; I++, Shift -= 16) { - Insts[I].setOpcode(AArch64::MOVKXi); - Insts[I].addOperand(MCOperand::createReg(Dest)); - Insts[I].addOperand(MCOperand::createReg(Dest)); - Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF)); - Insts[I].addOperand(MCOperand::createImm(Shift)); + InstructionListType Insts; + + Insts.emplace_back(); + MCInst &Inst = Insts.back(); + Inst.clear(); + Inst.setOpcode(AArch64::MOVZXi); + Inst.addOperand(MCOperand::createReg(Dest)); + Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF)); + Inst.addOperand(MCOperand::createImm(0)); + + int Shift = 16; + for (int I = 0; I < 3; I++, Shift += 16) { + const uint64_t ImmVal = (Imm >> Shift) & 0xFFFF; + if (!ImmVal) + continue; + Insts.emplace_back(); + MCInst &Inst = Insts.back(); + Inst.setOpcode(AArch64::MOVKXi); + Inst.addOperand(MCOperand::createReg(Dest)); + Inst.addOperand(MCOperand::createReg(Dest)); + Inst.addOperand(MCOperand::createImm(ImmVal)); + Inst.addOperand(MCOperand::createImm(Shift)); } return Insts; } @@ -2858,41 +2873,48 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr, - int CallSiteID, + size_t CallSiteID, MCContext *Ctx) override { - InstructionListType Insts; // Code sequence used to enter indirect call instrumentation helper: - // stp x0, x1, [sp, #-16]! createPushRegisters - // mov target x0 convertIndirectCallToLoad -> orr x0 target xzr - // mov x1 CallSiteID createLoadImmediate -> - // movk x1, #0x0, lsl #48 - // movk x1, #0x0, lsl #32 - // movk x1, #0x0, lsl #16 - // movk x1, #0x0 - // stp x0, x1, [sp, #-16]! - // bl *HandlerFuncAddr createIndirectCall -> - // adr x0 *HandlerFuncAddr -> adrp + add - // blr x0 + // snippet requires 2 registers: target address and call site id + // stp CallIDReg, x30, [sp, #-16]! + // movz/k CallIDReg, CallSiteID + // stp TAReg, CallIDReg, [sp, #-16]! ; push address and id for lib + // adr + add TAReg, *HandlerFuncAddr ; __bolt_instr_ind_call_handler_func + // blr TAReg + // ldr TAReg, [sp], #16 ; restore target address + // ldp CallIDReg, x30, [sp], #16 + // blr TAReg + + const MCRegister TAReg = CallInst.getOperand(0).getReg(); + const MCRegister CallIDReg = + TAReg != AArch64::X0 ? AArch64::X0 : AArch64::X1; + + InstructionListType Insts; Insts.emplace_back(); - createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); - Insts.emplace_back(CallInst); - convertIndirectCallToLoad(Insts.back(), AArch64::X0); - InstructionListType LoadImm = - createLoadImmediate(getIntArgRegister(1), CallSiteID); + createPushRegisters(Insts.back(), CallIDReg, AArch64::LR); + + InstructionListType LoadImm = createLoadImmediate(CallIDReg, CallSiteID); Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end()); + Insts.emplace_back(); - createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); + createPushRegisters(Insts.back(), TAReg, CallIDReg); + Insts.resize(Insts.size() + 2); - InstructionListType Addr = - materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0); + InstructionListType Addr = materializeAddress(HandlerFuncAddr, Ctx, TAReg); assert(Addr.size() == 2 && "Invalid Addr size"); std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); + + Insts.emplace_back(); + createIndirectCallInst(Insts.back(), false, TAReg); + + Insts.emplace_back(); + loadReg(Insts.back(), TAReg, getStackPointer()); + Insts.emplace_back(); - createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0); + createPopRegisters(Insts.back(), CallIDReg, AArch64::LR); - // Carry over metadata including tail call marker if present. - stripAnnotations(Insts.back()); - moveAnnotations(std::move(CallInst), Insts.back()); + Insts.emplace_back(CallInst); return Insts; } @@ -2901,12 +2923,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline, const MCSymbol *IndCallHandler, MCContext *Ctx) override { - // Code sequence used to check whether InstrTampoline was initialized + // Code sequence used to check whether InstrTrampoline was initialized // and call it if so, returns via IndCallHandler - // stp x0, x1, [sp, #-16]! - // mrs x1, nzcv - // adr x0, InstrTrampoline -> adrp + add - // ldr x0, [x0] + // adrp x0, InstrTrampoline + // ldr x0, [x0, #lo12:InstrTrampoline] // subs x0, x0, #0x0 // b.eq IndCallHandler // str x30, [sp, #-16]! @@ -2914,30 +2934,42 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // ldr x30, [sp], #16 // b IndCallHandler InstructionListType Insts; + + // load handler address + MCInst InstAdrp; + InstAdrp.setOpcode(AArch64::ADRP); + InstAdrp.addOperand(MCOperand::createReg(getIntArgRegister(0))); + InstAdrp.addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(InstAdrp, /* OpNum */ 1, InstrTrampoline, + /* Addend */ 0, Ctx, ELF::R_AARCH64_ADR_GOT_PAGE); + Insts.emplace_back(InstAdrp); + + MCInst InstLoad; + InstLoad.setOpcode(AArch64::LDRXui); + InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0))); + InstLoad.addOperand(MCOperand::createReg(getIntArgRegister(0))); + InstLoad.addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(InstLoad, /* OpNum */ 2, InstrTrampoline, + /* Addend */ 0, Ctx, ELF::R_AARCH64_LD64_GOT_LO12_NC); + Insts.emplace_back(InstLoad); + + InstructionListType CmpJmp = + createCmpJE(getIntArgRegister(0), 0, IndCallHandler, Ctx); + Insts.insert(Insts.end(), CmpJmp.begin(), CmpJmp.end()); + Insts.emplace_back(); - createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); - Insts.emplace_back(); - getSystemFlag(Insts.back(), getIntArgRegister(1)); - Insts.emplace_back(); - Insts.emplace_back(); - InstructionListType Addr = - materializeAddress(InstrTrampoline, Ctx, AArch64::X0); - std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); - assert(Addr.size() == 2 && "Invalid Addr size"); - Insts.emplace_back(); - loadReg(Insts.back(), AArch64::X0, AArch64::X0); - InstructionListType cmpJmp = - createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx); - Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end()); - Insts.emplace_back(); - storeReg(Insts.back(), AArch64::LR, AArch64::SP); + storeReg(Insts.back(), AArch64::LR, getStackPointer()); + Insts.emplace_back(); Insts.back().setOpcode(AArch64::BLR); - Insts.back().addOperand(MCOperand::createReg(AArch64::X0)); + Insts.back().addOperand(MCOperand::createReg(getIntArgRegister(0))); + Insts.emplace_back(); - loadReg(Insts.back(), AArch64::LR, AArch64::SP); + loadReg(Insts.back(), AArch64::LR, getStackPointer()); + Insts.emplace_back(); - createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true); + createDirectBranch(Insts.back(), IndCallHandler, Ctx); + return Insts; } diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp index d21bb628dcfcb..957768e5aaa29 100644 --- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp +++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp @@ -821,7 +821,7 @@ class RISCVMCPlusBuilder : public MCPlusBuilder { InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr, - int CallSiteID, + size_t CallSiteID, MCContext *Ctx) override { // Code sequence used to enter indirect call instrumentation helper: // addi sp, sp, -0x10 diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 7c24c2ce136fa..51e7d27f18a0b 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -3123,7 +3123,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr, - int CallSiteID, + size_t CallSiteID, MCContext *Ctx) override { // Check if the target address expression used in the original indirect call // uses the stack pointer, which we are going to clobber. diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp index c0b8fc3d807c9..bb5930a6083f3 100644 --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -1696,7 +1696,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call() #if defined(__aarch64__) // clang-format off __asm__ __volatile__(SAVE_ALL - "ldp x0, x1, [sp, #288]\n" + "ldp x0, x1, [sp, #272]\n" "bl instrumentIndirectCall\n" RESTORE_ALL "ret\n" @@ -1733,7 +1733,7 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() #if defined(__aarch64__) // clang-format off __asm__ __volatile__(SAVE_ALL - "ldp x0, x1, [sp, #288]\n" + "ldp x0, x1, [sp, #272]\n" "bl instrumentIndirectCall\n" RESTORE_ALL "ret\n" diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h index b1d04f9d558e0..9cb8e022f58df 100644 --- a/bolt/runtime/sys_aarch64.h +++ b/bolt/runtime/sys_aarch64.h @@ -18,10 +18,12 @@ "stp x24, x25, [sp, #-16]!\n" \ "stp x26, x27, [sp, #-16]!\n" \ "stp x28, x29, [sp, #-16]!\n" \ - "str x30, [sp,#-16]!\n" + "mrs x29, nzcv\n" \ + "stp x29, x30, [sp, #-16]!\n" // Mirrors SAVE_ALL #define RESTORE_ALL \ - "ldr x30, [sp], #16\n" \ + "ldp x29, x30, [sp], #16\n" \ + "msr nzcv, x29\n" \ "ldp x28, x29, [sp], #16\n" \ "ldp x26, x27, [sp], #16\n" \ "ldp x24, x25, [sp], #16\n" \ diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c index f9056da333b4e..cab0dc0ac2bbc 100644 --- a/bolt/test/runtime/AArch64/instrumentation-ind-call.c +++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c @@ -4,20 +4,96 @@ typedef int (*func_ptr)(int, int); int add(int a, int b) { return a + b; } +int getConst(int a, int b) { return 0xaa55; } + +void foo() { + // clang-format off + __asm__ __volatile("stp x29, x30 [sp, #-16]!\n" + "adrp x0, getConst\n" + "add x0, x0, :lo12:getConst\n" + "blr x0\n" + "ldp x29, x30 [sp], #16\n" + :::); + // clang-format on + return; +} + int main() { func_ptr fun; fun = add; int sum = fun(10, 20); // indirect call to 'add' printf("The sum is: %d\n", sum); + foo(); return 0; } /* REQUIRES: system-linux,bolt-runtime RUN: %clang %cflags %s -o %t.exe -Wl,-q -no-pie -fpie +RUN: llvm-objdump --disassemble-symbols=main %t.exe \ +RUN: | FileCheck %s --check-prefix=CHECKINDIRECTREG + +CHECKINDIRECTREG: mov w0, #0xa +CHECKINDIRECTREG-NEXT: mov w1, #0x14 +CHECKINDIRECTREG-NEXT: blr x8 RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \ -RUN: -o %t.instrumented +RUN: -o %t.instrumented \ +RUN: | FileCheck %s --check-prefix=CHECK-LOG + +CHECK-LOG-NOT: BOLT-INSTRUMENTER: Number of indirect call site descriptors: 0 + +RUN: llvm-objdump --disassemble-symbols=main %t.instrumented \ +RUN: | FileCheck %s --check-prefix=CHECK-MAIN + +RUN: llvm-objdump --disassemble-symbols=foo %t.instrumented \ +RUN: | FileCheck %s --check-prefix=CHECK-FOO + +RUN: llvm-objdump --disassemble-symbols=__bolt_instr_ind_call_handler \ +RUN: %t.instrumented | FileCheck %s --check-prefix=CHECK-INSTR-INDIR-CALL +RUN: llvm-objdump --disassemble-symbols=__bolt_instr_ind_call_handler_func \ +RUN: %t.instrumented | FileCheck %s --check-prefix=CHECK-INSTR-INDIR-CALL-FUNC + +CHECK-MAIN: mov w0, #0xa +CHECK-MAIN-NEXT: mov w1, #0x14 +// store current values +CHECK-MAIN-NEXT: stp x0, x30, [sp +// load callsite id +CHECK-MAIN-NEXT: mov x0, #0x0 +CHECK-MAIN-NEXT: stp x8, x0, [sp +CHECK-MAIN-NEXT: adrp x8, +CHECK-MAIN-NEXT: add x8, x8 +// call instrumentation library handler function +CHECK-MAIN-NEXT: blr x8 +// restore registers saved before +CHECK-MAIN-NEXT: ldr x8, [sp] +CHECK-MAIN-NEXT: ldp x0, x30, [sp] +// original indirect call instruction +CHECK-MAIN-NEXT: blr x8 + +// instrumented pattern for blr x0 +CHECK-FOO: stp x1, x30, [sp +CHECK-FOO-NEXT: mov x1, +CHECK-FOO-NEXT: stp x0, x1, [sp +CHECK-FOO-NEXT: adrp x0 +CHECK-FOO-NEXT: add x0, x0 +CHECK-FOO-NEXT: blr x0 +CHECK-FOO-NEXT: ldr x0, [sp] +CHECK-FOO-NEXT: ldp x1, x30, [sp] +CHECK-FOO-NEXT: blr x0 + +CHECK-INSTR-INDIR-CALL: __bolt_instr_ind_call_handler>: +CHECK-INSTR-INDIR-CALL-NEXT: ret + +CHECK-INSTR-INDIR-CALL-FUNC: __bolt_instr_ind_call_handler_func>: +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: adrp x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: ldr x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: cmp x0, #0x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: b.eq{{.*}}__bolt_instr_ind_call_handler +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: str x30 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: blr x0 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: ldr x30 +CHECK-INSTR-INDIR-CALL-FUNC-NEXT: b{{.*}}__bolt_instr_ind_call_handler # Instrumented program needs to finish returning zero RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT @@ -25,7 +101,7 @@ RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT # Test that the instrumented data makes sense RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \ RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \ -RUN: --print-only=main --print-finalized | FileCheck %s +RUN: --print-only=main,foo --print-finalized | FileCheck %s RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT @@ -33,6 +109,11 @@ CHECK-OUTPUT: The sum is: 30 # Check that our indirect call has 1 hit recorded in the fdata file and that # this was processed correctly by BOLT +CHECK-LABEL: Binary Function "foo" +CHECK: blr x0 # CallProfile: 1 (0 misses) : +CHECK-NEXT: { getConst: 1 (0 misses) } + +CHECK-LABEL: Binary Function "main" CHECK: blr x8 # CallProfile: 1 (0 misses) : CHECK-NEXT: { add: 1 (0 misses) } */