From 3559ef2a366e0003693c5f980d160a82c0d45601 Mon Sep 17 00:00:00 2001 From: Manman Ren Date: Wed, 16 Dec 2015 21:04:19 +0000 Subject: [PATCH] CXX_FAST_TLS calling convention: performance improvement for AArch64. The access function has a short entry and a short exit, the initialization block is only run the first time. To improve the performance, we want to have a short frame at the entry and exit. We explicitly handle most of the CSRs via copies. Only the CSRs that are not handled via copies will be in CSR_SaveList. Frame lowering and prologue/epilogue insertion will generate a short frame in the entry and exit according to CSR_SaveList. The majority of the CSRs will be handled by register allcoator. Register allocator will try to spill and reload them in the initialization block. We add CSRsViaCopy, it will be explicitly handled during lowering. 1> we first set FunctionLoweringInfo->SplitCSR if conditions are met (the target supports it for the given machine function and the function has only return exits). We also call TLI->initializeSplitCSR to perform initialization. 2> we call TLI->insertCopiesSplitCSR to insert copies from CSRsViaCopy to virtual registers at beginning of the entry block and copies from virtual registers to CSRsViaCopy at beginning of the exit blocks. 3> we also need to make sure the explicit copies will not be eliminated. The target independent portion was committed as r255353. rdar://problem/23557469 Differential Revision: http://reviews.llvm.org/D15341 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255821 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/AArch64CallingConvention.td | 8 ++ lib/Target/AArch64/AArch64FastISel.cpp | 3 + lib/Target/AArch64/AArch64ISelLowering.cpp | 59 +++++++++++++ lib/Target/AArch64/AArch64ISelLowering.h | 8 ++ .../AArch64/AArch64MachineFunctionInfo.h | 13 ++- lib/Target/AArch64/AArch64RegisterInfo.cpp | 14 +++- lib/Target/AArch64/AArch64RegisterInfo.h | 2 + test/CodeGen/AArch64/cxx-tlscc.ll | 83 +++++++++---------- 8 files changed, 145 insertions(+), 45 deletions(-) diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 66d92100e63..388d64ec4e9 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -288,6 +288,14 @@ def CSR_AArch64_CXX_TLS_Darwin (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), (sequence "D%u", 0, 31))>; +// CSRs that are handled by prologue, epilogue. +def CSR_AArch64_CXX_TLS_Darwin_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_AArch64_CXX_TLS_Darwin_ViaCopy + : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index cae2d527629..0ac4b39b035 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -3646,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector RetRegs; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4fe8565a06c..9f5beff1210 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3271,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AArch64::GPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AArch64::FPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } RetOps[0] = Chain; // Update chain. @@ -10003,3 +10016,49 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); } + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64unctionInfo. + AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 51a3fee4791..e99616c9406 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -385,6 +385,14 @@ public: bool isCheapToSpeculateCtlz() const override { return true; } + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const override; private: bool isExtFreeImpl(const Instruction *Ext) const override; diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index cc0ccc2771f..318f8395350 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// registers. unsigned VarArgsFPRSize; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) { (void)MF; } @@ -96,6 +102,9 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 763b2337de1..32b4888f2f6 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -47,11 +48,22 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) - return CSR_AArch64_CXX_TLS_Darwin_SaveList; + return MF->getInfo()->isSplitCSR() ? + CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : + CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo()->isSplitCSR()) + return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index c01bfa5ea70..f33f788fd43 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -35,6 +35,8 @@ public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll index 39f6c0fbec9..a9ae00c8d27 100644 --- a/test/CodeGen/AArch64/cxx-tlscc.ll +++ b/test/CodeGen/AArch64/cxx-tlscc.ll @@ -13,7 +13,7 @@ declare %struct.S* @_ZN1SC1Ev(%struct.S* returned) declare %struct.S* @_ZN1SD1Ev(%struct.S* returned) declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) -define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() { +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind { %.b.i = load i1, i1* @__tls_guard, align 1 br i1 %.b.i, label %__tls_init.exit, label %init.i @@ -28,50 +28,49 @@ __tls_init.exit: } ; CHECK-LABEL: _ZTW2sg -; CHECK-DAG: stp d31, d30 -; CHECK-DAG: stp d29, d28 -; CHECK-DAG: stp d27, d26 -; CHECK-DAG: stp d25, d24 -; CHECK-DAG: stp d23, d22 -; CHECK-DAG: stp d21, d20 -; CHECK-DAG: stp d19, d18 -; CHECK-DAG: stp d17, d16 -; CHECK-DAG: stp d7, d6 -; CHECK-DAG: stp d5, d4 -; CHECK-DAG: stp d3, d2 -; CHECK-DAG: stp d1, d0 -; CHECK-DAG: stp x20, x19 -; CHECK-DAG: stp x14, x13 -; CHECK-DAG: stp x12, x11 -; CHECK-DAG: stp x10, x9 -; CHECK-DAG: stp x8, x7 -; CHECK-DAG: stp x6, x5 -; CHECK-DAG: stp x4, x3 -; CHECK-DAG: stp x2, x1 -; CHECK-DAG: stp x29, x30 +; CHECK-NOT: stp d31, d30 +; CHECK-NOT: stp d29, d28 +; CHECK-NOT: stp d27, d26 +; CHECK-NOT: stp d25, d24 +; CHECK-NOT: stp d23, d22 +; CHECK-NOT: stp d21, d20 +; CHECK-NOT: stp d19, d18 +; CHECK-NOT: stp d17, d16 +; CHECK-NOT: stp d7, d6 +; CHECK-NOT: stp d5, d4 +; CHECK-NOT: stp d3, d2 +; CHECK-NOT: stp d1, d0 +; CHECK-NOT: stp x20, x19 +; CHECK-NOT: stp x14, x13 +; CHECK-NOT: stp x12, x11 +; CHECK-NOT: stp x10, x9 +; CHECK-NOT: stp x8, x7 +; CHECK-NOT: stp x6, x5 +; CHECK-NOT: stp x4, x3 +; CHECK-NOT: stp x2, x1 ; CHECK: blr ; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]] ; CHECK: blr ; CHECK: tlv_atexit ; CHECK: [[BB_end]]: ; CHECK: blr -; CHECK-DAG: ldp x2, x1 -; CHECK-DAG: ldp x4, x3 -; CHECK-DAG: ldp x6, x5 -; CHECK-DAG: ldp x8, x7 -; CHECK-DAG: ldp x10, x9 -; CHECK-DAG: ldp x12, x11 -; CHECK-DAG: ldp x14, x13 -; CHECK-DAG: ldp x20, x19 -; CHECK-DAG: ldp d1, d0 -; CHECK-DAG: ldp d3, d2 -; CHECK-DAG: ldp d5, d4 -; CHECK-DAG: ldp d7, d6 -; CHECK-DAG: ldp d17, d16 -; CHECK-DAG: ldp d19, d18 -; CHECK-DAG: ldp d21, d20 -; CHECK-DAG: ldp d23, d22 -; CHECK-DAG: ldp d25, d24 -; CHECK-DAG: ldp d27, d26 -; CHECK-DAG: ldp d29, d28 -; CHECK-DAG: ldp d31, d30 +; CHECK-NOT: ldp x2, x1 +; CHECK-NOT: ldp x4, x3 +; CHECK-NOT: ldp x6, x5 +; CHECK-NOT: ldp x8, x7 +; CHECK-NOT: ldp x10, x9 +; CHECK-NOT: ldp x12, x11 +; CHECK-NOT: ldp x14, x13 +; CHECK-NOT: ldp x20, x19 +; CHECK-NOT: ldp d1, d0 +; CHECK-NOT: ldp d3, d2 +; CHECK-NOT: ldp d5, d4 +; CHECK-NOT: ldp d7, d6 +; CHECK-NOT: ldp d17, d16 +; CHECK-NOT: ldp d19, d18 +; CHECK-NOT: ldp d21, d20 +; CHECK-NOT: ldp d23, d22 +; CHECK-NOT: ldp d25, d24 +; CHECK-NOT: ldp d27, d26 +; CHECK-NOT: ldp d29, d28 +; CHECK-NOT: ldp d31, d30 -- 2.34.1