return false;
}
-void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) const {
+/// Select and emit the stack probe expansion for the current subtarget.
+/// Windows CoreCLR gets a placeholder stub when probing in the prolog and the
+/// inline probe sequence otherwise; every other target gets the helper call.
+/// Returns whatever the selected expansion returns.
+MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               DebugLoc DL,
+                                               bool InProlog) const {
+  // Anything that is not Windows CoreCLR probes through a helper call.
+  if (!MF.getSubtarget<X86Subtarget>().isTargetWindowsCoreCLR())
+    return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+
+  // CoreCLR, prolog: only drop in the stub call here.
+  if (InProlog)
+    return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+
+  // CoreCLR, function body: expand the probe sequence in place.
+  return emitStackProbeInline(MF, MBB, MBBI, DL, false);
+}
+
+/// Replace the "__chkstk_stub" placeholder call in \p PrologMBB, if there is
+/// one, with the inline stack probe sequence. Does nothing when the block
+/// contains no such call.
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &PrologMBB) const {
+  const StringRef StubName = "__chkstk_stub";
+
+  // Locate the first call in the block whose callee is the stub symbol.
+  MachineInstr *StubCall = nullptr;
+  for (MachineInstr &Inst : PrologMBB) {
+    if (!Inst.isCall())
+      continue;
+    if (!Inst.getOperand(0).isSymbol())
+      continue;
+    if (StubName == Inst.getOperand(0).getSymbolName()) {
+      StubCall = &Inst;
+      break;
+    }
+  }
+
+  // No placeholder present: nothing to expand.
+  if (StubCall == nullptr)
+    return;
+
+  // Expand the probe at the point just after the stub, then drop the stub.
+  MachineBasicBlock::iterator MBBI = std::next(StubCall->getIterator());
+  assert(std::prev(MBBI).operator==(StubCall) &&
+         "MBBI expected after __chkstk_stub.");
+  DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
+  emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
+  StubCall->eraseFromParent();
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInline(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ assert(STI.is64Bit() && "different expansion needed for 32 bit");
+ assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ // RAX contains the number of bytes of desired stack adjustment.
+ // The handling here assumes this value has already been updated so as to
+ // maintain stack alignment.
+ //
+ // We need to exit with RSP modified by this amount and execute suitable
+ // page touches to notify the OS that we're growing the stack responsibly.
+ // All stack probing must be done without modifying RSP.
+ //
+ // MBB:
+ //    SizeReg = RAX;
+ //    ZeroReg = 0
+ //    CopyReg = RSP
+ //    Flags, TestReg = CopyReg - SizeReg
+ //    FinalReg = !Flags.Carry ? TestReg : ZeroReg
+ //    LimitReg = gs magic thread env access
+ //    if FinalReg >= LimitReg goto ContinueMBB
+ // RoundMBB:
+ //    RoundedReg = page address of FinalReg
+ // LoopMBB:
+ //    JoinReg = PHI(LimitReg,ProbeReg)
+ //    ProbeReg = JoinReg - PageSize
+ //    [ProbeReg] = 0
+ //    if (ProbeReg != RoundedReg) goto LoopMBB
+ // ContinueMBB:
+ //    RSP = RSP - SizeReg
+ //    [rest of original MBB]
+
+ // Set up the new basic blocks
+ MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+ MF.insert(MBBIter, RoundMBB);
+ MF.insert(MBBIter, LoopMBB);
+ MF.insert(MBBIter, ContinueMBB);
+
+ // Split MBB, moving the tail to ContinueMBB (assumes MBBI != MBB.begin()).
+ MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+ ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+ ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ // Some useful constants
+ const int64_t ThreadEnvironmentStackLimit = 0x10;
+ const int64_t PageSize = 0x1000;
+ const int64_t PageMask = ~(PageSize - 1);
+
+ // Registers we need. For the normal case we use virtual
+ // registers. For the prolog expansion we use RAX, RCX and RDX.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+ const unsigned
+ SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),
+ ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+ CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+ TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+ FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+ RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+ LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+ JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+ ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass);
+
+ // SP-relative offsets where we can save RCX and RDX.
+ int64_t RCXShadowSlot = 0;
+ int64_t RDXShadowSlot = 0;
+
+ // If inlining in the prolog, save RCX and RDX.
+ // Future optimization: don't save or restore if not live in.
+ if (InProlog) {
+ // Compute the offsets. We need to account for things already
+ // pushed onto the stack at this point: return address, frame
+ // pointer (if used), and callee saves.
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+ const bool HasFP = hasFP(MF);
+ RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+ RDXShadowSlot = RCXShadowSlot + 8;
+ // Emit the saves.
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RCXShadowSlot)
+ .addReg(X86::RCX);
+ addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+ RDXShadowSlot)
+ .addReg(X86::RDX);
+ } else {
+ // Not in the prolog. Copy RAX to a virtual reg.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+ }
+
+ // Add code to MBB to check for overflow and set the new target stack pointer
+ // to zero if so.
+ BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+ .addReg(ZeroReg, RegState::Undef)
+ .addReg(ZeroReg, RegState::Undef);
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+ BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+ .addReg(CopyReg)
+ .addReg(SizeReg);
+ BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+ .addReg(TestReg)
+ .addReg(ZeroReg);
+
+ // FinalReg now holds final stack pointer value, or zero if
+ // allocation would overflow. Compare against the current stack
+ // limit from the thread environment block. Note this limit is the
+ // lowest touched page on the stack, not the point at which the OS
+ // will cause an overflow exception, so this is just an optimization
+ // to avoid unnecessarily touching pages that are below the current
+ // SP but already committed to the stack by the OS.
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+ .addReg(0)
+ .addImm(1)
+ .addReg(0)
+ .addImm(ThreadEnvironmentStackLimit)
+ .addReg(X86::GS);
+ BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+ // Jump if the desired stack pointer is at or above the stack limit.
+ BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+
+ // Add code to RoundMBB to round the final stack pointer to a page boundary.
+ BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+ .addReg(FinalReg)
+ .addImm(PageMask);
+ BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+ // LimitReg now holds the current stack limit, RoundedReg the page-rounded
+ // final RSP value. Add code to LoopMBB to decrement LimitReg page-by-page
+ // and probe until we reach RoundedReg.
+ if (!InProlog) {
+ BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+ .addReg(LimitReg)
+ .addMBB(RoundMBB)
+ .addReg(ProbeReg)
+ .addMBB(LoopMBB);
+ }
+
+ addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+ false, -PageSize);
+
+ // Probe by storing a zero byte at the address held in ProbeReg.
+ BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+ .addReg(ProbeReg)
+ .addImm(1)
+ .addReg(0)
+ .addImm(0)
+ .addReg(0)
+ .addImm(0);
+ BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+ .addReg(RoundedReg)
+ .addReg(ProbeReg);
+ BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+
+ MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+ // If in prolog, restore RCX and RDX from their shadow slots.
+ if (InProlog) {
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RCX),
+ X86::RSP, false, RCXShadowSlot);
+ addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+ X86::RDX),
+ X86::RSP, false, RDXShadowSlot);
+ }
+
+ // Now that the probing is done, add code to ContinueMBB to update
+ // the stack pointer for real.
+ BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+ .addReg(X86::RSP)
+ .addReg(SizeReg);
+
+ // Add the control flow edges we need.
+ MBB.addSuccessor(ContinueMBB);
+ MBB.addSuccessor(RoundMBB);
+ RoundMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(ContinueMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+
+ // Mark all the instructions added to the prolog as frame setup.
+ if (InProlog) {
+ for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+ BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *RoundMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineInstr &MI : *LoopMBB) {
+ MI.setFlag(MachineInstr::FrameSetup);
+ }
+ for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+ CMBBI != ContinueMBBI; ++CMBBI) {
+ CMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+ }
+
+ // Possible TODO: physreg liveness for InProlog case.
+
+ return ContinueMBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeCall(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
unsigned CallOp;
Symbol = "_chkstk";
MachineInstrBuilder CI;
+ MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
// All current stack probes take AX and SP as input, clobber flags, and
// preserve all registers. x86_64 probes leave RSP unmodified.
.addReg(X86::RSP)
.addReg(X86::RAX);
}
+
+ if (InProlog) {
+ // Apply the frame setup flag to all inserted instrs.
+ for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+ ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+ }
+
+ return MBBI;
+}
+
+/// Emit a placeholder for the inline stack probe: a CALLpcrel32 to the
+/// external symbol "__chkstk_stub", inserted before \p MBBI. inlineStackProbe
+/// later finds this call by symbol name and replaces it with the real probe
+/// sequence. Only valid in the prolog (\p InProlog must be true).
+/// Returns \p MBBI, i.e. the instruction just after the stub.
+MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+
+  assert(InProlog && "ChkStkStub called outside prolog!");
+
+  // The builder is only needed to attach the callee operand; nothing reads it
+  // afterwards, so it is not bound to a local.
+  BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+      .addExternalSymbol("__chkstk_stub");
+
+  return MBBI;
 }
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
// We'll also use 4 already allocated bytes for EAX.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
- .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
- .setMIFlag(MachineInstr::FrameSetup);
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
}
- // Save a pointer to the MI where we set AX.
- MachineBasicBlock::iterator SetRAX = MBBI;
- --SetRAX;
-
// Call __chkstk, __chkstk_ms, or __alloca.
- emitStackProbeCall(MF, MBB, MBBI, DL);
-
- // Apply the frame setup flag to all inserted instrs.
- for (; SetRAX != MBBI; ++SetRAX)
- SetRAX->setFlag(MachineInstr::FrameSetup);
+ emitStackProbe(MF, MBB, MBBI, DL, true);
if (isEAXAlive) {
// Restore EAX
- MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
- X86::EAX),
- StackPtr, false, NumBytes - 4);
+ MachineInstr *MI =
+ addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+ StackPtr, false, NumBytes - 4);
MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
unsigned StackPtr;
- /// Emit a call to the target's stack probe function. This is required for all
+ /// Emit target stack probe code. This is required for all
/// large stack allocations on Windows. The caller is required to materialize
- /// the number of bytes to probe in RAX/EAX.
- void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL) const;
+ /// the number of bytes to probe in RAX/EAX. Returns instruction just
+ /// after the expansion.
+ MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ bool InProlog) const;
+
+ /// Replace a StackProbe inline-stub with the actual probe code inline.
+ /// Looks for a call to the symbol "__chkstk_stub" in \p PrologMBB and does
+ /// nothing if the block contains no such call.
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const override;
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
/// \p MBB will be correctly handled by the target.
bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+ /// Try to convert a call sequence that uses sub and mov instructions to
+ /// put the arguments onto the stack into a series of pushes.
+ ///
+ /// \returns true if the transformation succeeded, false if not.
+ bool convertArgMovsToPushes(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ uint64_t Amount) const;
+
/// Wraps up getting a CFI index and building a MachineInstr for it.
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
DebugLoc DL, MCCFIInstruction CFIInst) const;
private:
uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+ /// Emit target stack probe as a call to a helper function. When \p InProlog
+ /// is set, the inserted instructions are flagged as frame setup. Returns
+ /// \p MBBI, the instruction just after the expansion.
+ MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
+ /// Emit target stack probe as an inline sequence. Splits \p MBB at \p MBBI
+ /// and adds new blocks for the probe loop. When \p InProlog is set the
+ /// expansion uses RAX, RCX and RDX instead of virtual registers and flags
+ /// the new instructions as frame setup. Returns the instruction just after
+ /// the expansion.
+ MachineInstr *emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
+ /// Emit a stub to later inline the target stack probe. The stub is a call
+ /// to the external symbol "__chkstk_stub", which inlineStackProbe replaces
+ /// with the inline sequence. Only valid in the prolog. Returns \p MBBI.
+ MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, bool InProlog) const;
+
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc DL,
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-pc-win32-coreclr | FileCheck %s -check-prefix=WIN_X64
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -check-prefix=LINUX
+
+; By default, Windows CoreCLR requires an inline prologue stack expansion check
+; if 4096 or more bytes are allocated on the stack.
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+define i32 @main4k() nounwind {
+entry:
+; WIN_X64-LABEL:main4k:
+; WIN_X64: # BB#0:
+; WIN_X64: movl $4096, %eax
+; WIN_X64: movq %rcx, 8(%rsp)
+; WIN_X64: movq %rdx, 16(%rsp)
+; WIN_X64: xorq %rcx, %rcx
+; WIN_X64: movq %rsp, %rdx
+; WIN_X64: subq %rax, %rdx
+; WIN_X64: cmovbq %rcx, %rdx
+; WIN_X64: movq %gs:16, %rcx
+; WIN_X64: cmpq %rcx, %rdx
+; WIN_X64: jae .LBB0_3
+; WIN_X64:# BB#1:
+; WIN_X64: andq $-4096, %rdx
+; WIN_X64:.LBB0_2:
+; WIN_X64: leaq -4096(%rcx), %rcx
+; WIN_X64: movb $0, (%rcx)
+; WIN_X64: cmpq %rcx, %rdx
+; WIN_X64: jne .LBB0_2
+; WIN_X64:.LBB0_3:
+; WIN_X64: movq 8(%rsp), %rcx
+; WIN_X64: movq 16(%rsp), %rdx
+; WIN_X64: subq %rax, %rsp
+; WIN_X64: xorl %eax, %eax
+; WIN_X64: addq $4096, %rsp
+; WIN_X64: retq
+; LINUX-LABEL:main4k:
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [4096 x i8]
+ ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with frame pointer
+define i32 @main4k_frame() nounwind "no-frame-pointer-elim"="true" {
+entry:
+; WIN_X64-LABEL:main4k_frame:
+; WIN_X64: movq %rcx, 16(%rsp)
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-LABEL:main4k_frame:
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [4096 x i8]
+ ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with INT args
+define i32 @main4k_intargs(i32 %x, i32 %y) nounwind {
+entry:
+; WIN_X64: movq %rcx, 8(%rsp)
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [4096 x i8]
+ %t = add i32 %x, %y
+ ret i32 %t
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with FP regs
+define i32 @main4k_fpargs(double %x, double %y) nounwind {
+entry:
+; WIN_X64: movq %rcx, 8(%rsp)
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [4096 x i8]
+ ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with mixed regs
+define i32 @main4k_mixargs(double %x, i32 %y) nounwind {
+entry:
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [4096 x i8]
+ ret i32 %y
+}
+
+; Make sure we don't emit the probe for a smaller prolog stack allocation.
+define i32 @main128() nounwind {
+entry:
+; WIN_X64-NOT: movq %gs:16, %rcx
+; WIN_X64: retq
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [128 x i8]
+ ret i32 0
+}
+
+; Make sure we don't emit the probe sequence if not on Windows even if the
+; function has the Win64 calling convention.
+define x86_64_win64cc i32 @main4k_win64() nounwind {
+entry:
+; WIN_X64: movq %gs:16, %rcx
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [4096 x i8]
+ ret i32 0
+}
+
+declare i32 @bar(i8*) nounwind
+
+; Within-body inline probe expansion
+define x86_64_win64cc i32 @main4k_alloca(i64 %n) nounwind {
+entry:
+; WIN_X64: callq bar
+; WIN_X64: movq %gs:16, [[R:%r.*]]
+; WIN_X64: callq bar
+; LINUX: callq bar
+; LINUX-NOT: movq %gs:16, [[R:%r.*]]
+; LINUX: callq bar
+ %a = alloca i8, i64 1024
+ %ra = call i32 @bar(i8* %a) nounwind
+ %b = alloca i8, i64 %n
+ %rb = call i32 @bar(i8* %b) nounwind
+ %r = add i32 %ra, %rb
+ ret i32 %r
+}
+
+; Influence of stack-probe-size attribute
+; Note this is not exposed in coreclr
+define i32 @test_probe_size() "stack-probe-size"="8192" nounwind {
+; WIN_X64-NOT: movq %gs:16, %rcx
+; WIN_X64: retq
+; LINUX-NOT: movq %gs:16, %rcx
+; LINUX: retq
+ %a = alloca [4096 x i8]
+ ret i32 0
+}