Support for emitting inline stack probes

author Andy Ayers <andya@microsoft.com>

Tue, 10 Nov 2015 01:50:49 +0000 (01:50 +0000)

committer Andy Ayers <andya@microsoft.com>

Tue, 10 Nov 2015 01:50:49 +0000 (01:50 +0000)
author Andy Ayers <andya@microsoft.com>
Tue, 10 Nov 2015 01:50:49 +0000 (01:50 +0000)
committer Andy Ayers <andya@microsoft.com>
Tue, 10 Nov 2015 01:50:49 +0000 (01:50 +0000)
diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h

index 398c91ef5624947708928bb86277c019bf5d04d9..cadd07d71f12d8a46bc04710518ded1657277985 100644 (file)
--- a/include/llvm/Target/TargetFrameLowering.h
+++ b/include/llvm/Target/TargetFrameLowering.h
@@ -158,6 +158,10 @@ public:
    virtual void emitEpilogue(MachineFunction &MF,
                              MachineBasicBlock &MBB) const = 0;
  
+  /// Replace a StackProbe stub (if any) with the actual probe code inline
+  virtual void inlineStackProbe(MachineFunction &MF,
+                                MachineBasicBlock &PrologueMBB) const {}
+
    /// Adjust the prologue to have the function use segmented stacks. This works
    /// by adding a check even before the "normal" function prologue.
    virtual void adjustForSegmentedStacks(MachineFunction &MF,
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp

index 13e753692de417c4b64bef24f31290be1491d334..939c50027b0224af47bdac1b51a8182028cec9b4 100644 (file)
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -781,6 +781,9 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
    for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
      TFI.emitEpilogue(Fn, *RestoreBlock);
  
+  for (MachineBasicBlock *SaveBlock : SaveBlocks)
+    TFI.inlineStackProbe(Fn, *SaveBlock);
+
    // Emit additional code that is required to support segmented stacks, if
    // we've been asked for it.  This, when linked with a runtime with support
    // for segmented stacks (libgcc is one), will result in allocating stack
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp

index 875c63c4f6f80f99fb77bfbb85bc98b55a0d7f99..437e671dc99b9c16e32a4a8dd7b957e936579a34 100644 (file)
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -431,10 +431,257 @@ static bool usesTheStack(const MachineFunction &MF) {
    return false;
  }
  
-void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
-                                          MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          DebugLoc DL) const {
+MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               DebugLoc DL,
+                                               bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  if (STI.isTargetWindowsCoreCLR()) {
+    if (InProlog) {
+      return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+    } else {
+      return emitStackProbeInline(MF, MBB, MBBI, DL, false);
+    }
+  } else {
+    return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+  }
+}
+
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &PrologMBB) const {
+  const StringRef ChkStkStubSymbol = "__chkstk_stub";
+  MachineInstr *ChkStkStub = nullptr;
+
+  for (MachineInstr &MI : PrologMBB) {
+    if (MI.isCall() && MI.getOperand(0).isSymbol() &&
+        ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
+      ChkStkStub = &MI;
+      break;
+    }
+  }
+
+  if (ChkStkStub != nullptr) {
+    MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
+    assert(std::prev(MBBI).operator==(ChkStkStub) &&
+      "MBBI expected after __chkstk_stub.");
+    DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
+    emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
+    ChkStkStub->eraseFromParent();
+  }
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInline(
+  MachineFunction &MF, MachineBasicBlock &MBB,
+  MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  assert(STI.is64Bit() && "different expansion needed for 32 bit");
+  assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+  // RAX contains the number of bytes of desired stack adjustment.
+  // The handling here assumes this value has already been updated so as to
+  // maintain stack alignment.
+  //
+  // We need to exit with RSP modified by this amount and execute suitable
+  // page touches to notify the OS that we're growing the stack responsibly.
+  // All stack probing must be done without modifying RSP.
+  //
+  // MBB:
+  //    SizeReg = RAX;
+  //    ZeroReg = 0
+  //    CopyReg = RSP
+  //    Flags, TestReg = CopyReg - SizeReg
+  //    FinalReg = !Flags.Ovf ? TestReg : ZeroReg
+  //    LimitReg = gs magic thread env access
+  //    if FinalReg >= LimitReg goto ContinueMBB
+  // RoundBB:
+  //    RoundReg = page address of FinalReg
+  // LoopMBB:
+  //    LoopReg = PHI(LimitReg,ProbeReg)
+  //    ProbeReg = LoopReg - PageSize
+  //    [ProbeReg] = 0
+  //    if (ProbeReg > RoundReg) goto LoopMBB
+  // ContinueMBB:
+  //    RSP = RSP - RAX
+  //    [rest of original MBB]
+
+  // Set up the new basic blocks
+  MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+  MF.insert(MBBIter, RoundMBB);
+  MF.insert(MBBIter, LoopMBB);
+  MF.insert(MBBIter, ContinueMBB);
+
+  // Split MBB and move the tail portion down to ContinueMBB.
+  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  // Some useful constants
+  const int64_t ThreadEnvironmentStackLimit = 0x10;
+  const int64_t PageSize = 0x1000;
+  const int64_t PageMask = ~(PageSize - 1);
+
+  // Registers we need. For the normal case we use virtual
+  // registers. For the prolog expansion we use RAX, RCX and RDX.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+  const unsigned
+      SizeReg = InProlog ? X86::RAX : MRI.createVirtualRegister(RegClass),
+      ZeroReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      CopyReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      TestReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      FinalReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      RoundedReg = InProlog ? X86::RDX : MRI.createVirtualRegister(RegClass),
+      LimitReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      JoinReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass),
+      ProbeReg = InProlog ? X86::RCX : MRI.createVirtualRegister(RegClass);
+
+  // SP-relative offsets where we can save RCX and RDX.
+  int64_t RCXShadowSlot = 0;
+  int64_t RDXShadowSlot = 0;
+
+  // If inlining in the prolog, save RCX and RDX.     
+  // Future optimization: don't save or restore if not live in.
+  if (InProlog) {
+    // Compute the offsets. We need to account for things already
+    // pushed onto the stack at this point: return address, frame
+    // pointer (if used), and callee saves.
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+    const bool HasFP = hasFP(MF);
+    RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+    RDXShadowSlot = RCXShadowSlot + 8;
+    // Emit the saves.
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RCXShadowSlot)
+        .addReg(X86::RCX);
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RDXShadowSlot)
+        .addReg(X86::RDX);
+  } else {
+    // Not in the prolog. Copy RAX to a virtual reg.
+    BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+  }
+
+  // Add code to MBB to check for overflow and set the new target stack pointer
+  // to zero if so.
+  BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+      .addReg(ZeroReg, RegState::Undef)
+      .addReg(ZeroReg, RegState::Undef);
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+  BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+      .addReg(CopyReg)
+      .addReg(SizeReg);
+  BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+      .addReg(TestReg)
+      .addReg(ZeroReg);
+
+  // FinalReg now holds final stack pointer value, or zero if
+  // allocation would overflow. Compare against the current stack
+  // limit from the thread environment block. Note this limit is the
+  // lowest touched page on the stack, not the point at which the OS
+  // will cause an overflow exception, so this is just an optimization
+  // to avoid unnecessarily touching pages that are below the current
+  // SP but already commited to the stack by the OS.
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+      .addReg(0)
+      .addImm(1)
+      .addReg(0)
+      .addImm(ThreadEnvironmentStackLimit)
+      .addReg(X86::GS);
+  BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+  // Jump if the desired stack pointer is at or above the stack limit.
+  BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+
+  // Add code to roundMBB to round the final stack pointer to a page boundary.
+  BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+      .addReg(FinalReg)
+      .addImm(PageMask);
+  BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+  // LimitReg now holds the current stack limit, RoundedReg page-rounded
+  // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
+  // and probe until we reach RoundedReg.
+  if (!InProlog) {
+    BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+        .addReg(LimitReg)
+        .addMBB(RoundMBB)
+        .addReg(ProbeReg)
+        .addMBB(LoopMBB);
+  }
+
+  addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+               false, -PageSize);
+
+  // Probe by storing a byte onto the stack.
+  BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+      .addReg(ProbeReg)
+      .addImm(1)
+      .addReg(0)
+      .addImm(0)
+      .addReg(0)
+      .addImm(0);
+  BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+      .addReg(RoundedReg)
+      .addReg(ProbeReg);
+  BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+
+  MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+  // If in prolog, restore RDX and RCX.
+  if (InProlog) {
+    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+                         X86::RCX),
+                 X86::RSP, false, RCXShadowSlot);
+    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+                         X86::RDX),
+                 X86::RSP, false, RDXShadowSlot);
+  }
+
+  // Now that the probing is done, add code to continueMBB to update
+  // the stack pointer for real.
+  BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+      .addReg(X86::RSP)
+      .addReg(SizeReg);
+
+  // Add the control flow edges we need.
+  MBB.addSuccessor(ContinueMBB);
+  MBB.addSuccessor(RoundMBB);
+  RoundMBB->addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(ContinueMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+
+  // Mark all the instructions added to the prolog as frame setup.
+  if (InProlog) {
+    for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+      BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineInstr &MI : *RoundMBB) {
+      MI.setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineInstr &MI : *LoopMBB) {
+      MI.setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+         CMBBI != ContinueMBBI; ++CMBBI) {
+      CMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+  }
+
+  // Possible TODO: physreg liveness for InProlog case.
+
+  return ContinueMBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeCall(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
    bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
  
    unsigned CallOp;
@@ -456,6 +703,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
      Symbol = "_chkstk";
  
    MachineInstrBuilder CI;
+  MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
  
    // All current stack probes take AX and SP as input, clobber flags, and
    // preserve all registers. x86_64 probes leave RSP unmodified.
@@ -485,6 +733,26 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
          .addReg(X86::RSP)
          .addReg(X86::RAX);
    }
+
+  if (InProlog) {
+    // Apply the frame setup flag to all inserted instrs.
+    for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+      ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+  }
+
+  return MBBI;
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInlineStub(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const {
+
+  assert(InProlog && "ChkStkStub called outside prolog!");
+
+  MachineInstrBuilder CI = BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+                               .addExternalSymbol("__chkstk_stub");
+
+  return MBBI;
  }
  
  static unsigned calculateSetFPREG(uint64_t SPAdjust) {
@@ -893,26 +1161,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
        // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
        // We'll also use 4 already allocated bytes for EAX.
        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
-        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
-        .setMIFlag(MachineInstr::FrameSetup);
+          .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+          .setMIFlag(MachineInstr::FrameSetup);
      }
  
-    // Save a pointer to the MI where we set AX.
-    MachineBasicBlock::iterator SetRAX = MBBI;
-    --SetRAX;
-
      // Call __chkstk, __chkstk_ms, or __alloca.
-    emitStackProbeCall(MF, MBB, MBBI, DL);
-
-    // Apply the frame setup flag to all inserted instrs.
-    for (; SetRAX != MBBI; ++SetRAX)
-      SetRAX->setFlag(MachineInstr::FrameSetup);
+    emitStackProbe(MF, MBB, MBBI, DL, true);
  
      if (isEAXAlive) {
        // Restore EAX
-      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
-                                              X86::EAX),
-                                      StackPtr, false, NumBytes - 4);
+      MachineInstr *MI =
+          addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+                       StackPtr, false, NumBytes - 4);
        MI->setFlag(MachineInstr::FrameSetup);
        MBB.insert(MBBI, MI);
      }
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h

index 6975d250296befdd7e752cd4a5f74ca81cd90b70..db32be129248b79af2e2839997249c5c1bf306e3 100644 (file)
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -47,11 +47,17 @@ public:
  
    unsigned StackPtr;
  
-  /// Emit a call to the target's stack probe function. This is required for all
+  /// Emit target stack probe code. This is required for all
    /// large stack allocations on Windows. The caller is required to materialize
-  /// the number of bytes to probe in RAX/EAX.
-  void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI, DebugLoc DL) const;
+  /// the number of bytes to probe in RAX/EAX. Returns instruction just
+  /// after the expansion.
+  MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                               bool InProlog) const;
+
+  /// Replace a StackProbe inline-stub with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologMBB) const override;
  
    void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
@@ -125,6 +131,15 @@ public:
    /// \p MBB will be correctly handled by the target.
    bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
  
+  /// convertArgMovsToPushes - This method tries to convert a call sequence
+  /// that uses sub and mov instructions to put the argument onto the stack
+  /// into a series of pushes.
+  /// Returns true if the transformation succeeded, false if not.
+  bool convertArgMovsToPushes(MachineFunction &MF, 
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I, 
+                              uint64_t Amount) const;
+
    /// Wraps up getting a CFI index and building a MachineInstr for it.
    void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  DebugLoc DL, MCCFIInstruction CFIInst) const;
@@ -139,6 +154,23 @@ public:
  private:
    uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
  
+  /// Emit target stack probe as a call to a helper function
+  MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   DebugLoc DL, bool InProlog) const;
+
+  /// Emit target stack probe as an inline sequence.
+  MachineInstr *emitStackProbeInline(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL, bool InProlog) const;
+
+  /// Emit a stub to later inline the target stack probe.
+  MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+                                         MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MBBI,
+                                         DebugLoc DL, bool InProlog) const;
+
    /// Aligns the stack pointer by ANDing it with -MaxAlign.
    void BuildStackAlignAND(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, DebugLoc DL,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index f5b38b396965671b64b26433ca8ab67083a78d19..f23b5656f88828054e4d7f1e05f037e79ee23fc6 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21380,15 +21380,13 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
  MachineBasicBlock *
  X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                          MachineBasicBlock *BB) const {
-  DebugLoc DL = MI->getDebugLoc();
-
    assert(!Subtarget->isTargetMachO());
-
-  Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI,
-                                                    DL);
-
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return BB;
+  DebugLoc DL = MI->getDebugLoc();
+  MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe(
+      *BB->getParent(), *BB, MI, DL, false);
+  MachineBasicBlock *ResumeBB = ResumeMI->getParent();
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return ResumeBB;
  }
  
  MachineBasicBlock *
diff --git a/test/CodeGen/X86/stack-probe-size.ll b/test/CodeGen/X86/stack-probe-size.ll

index 43c62c113f87d5a020cd1e8f61c6dc23e1fc27a2..4d1f88d111726f7629947d0245527f441879621a 100644 (file)
--- a/test/CodeGen/X86/stack-probe-size.ll
+++ b/test/CodeGen/X86/stack-probe-size.ll
@@ -7,7 +7,6 @@
  ; this is unlikely to change in the future.
  ;
  ; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
-; RUN: llc -mtriple=i686-windows-coreclr < %s | FileCheck %s
  
  target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
  
diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll

new file mode 100644 (file)

index 0000000..c9a5fc2
--- /dev/null
+++ b/test/CodeGen/X86/win_coreclr_chkstk.ll
@@ -0,0 +1,143 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32-coreclr | FileCheck %s -check-prefix=WIN_X64
+; RUN: llc < %s -mtriple=x86_64-pc-linux         | FileCheck %s -check-prefix=LINUX
+
+; By default, windows CoreCLR requires an inline prologue stack expansion check
+; if more than 4096 bytes are allocated on the stack.
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+define i32 @main4k() nounwind {
+entry:
+; WIN_X64-LABEL:main4k:
+; WIN_X64: # BB#0:
+; WIN_X64:      movl    $4096, %eax
+; WIN_X64:      movq    %rcx, 8(%rsp)
+; WIN_X64:     movq    %rdx, 16(%rsp)
+; WIN_X64:     xorq    %rcx, %rcx
+; WIN_X64:     movq    %rsp, %rdx
+; WIN_X64:     subq    %rax, %rdx
+; WIN_X64:     cmovbq  %rcx, %rdx
+; WIN_X64:     movq    %gs:16, %rcx
+; WIN_X64:     cmpq    %rcx, %rdx
+; WIN_X64:     jae     .LBB0_3
+; WIN_X64:# BB#1:
+; WIN_X64:     andq    $-4096, %rdx
+; WIN_X64:.LBB0_2:
+; WIN_X64:     leaq    -4096(%rcx), %rcx
+; WIN_X64:     movb    $0, (%rcx)
+; WIN_X64:     cmpq    %rcx, %rdx
+; WIN_X64:     jne     .LBB0_2
+; WIN_X64:.LBB0_3:
+; WIN_X64:     movq    8(%rsp), %rcx
+; WIN_X64:     movq    16(%rsp), %rdx
+; WIN_X64:     subq    %rax, %rsp
+; WIN_X64:     xorl    %eax, %eax
+; WIN_X64:     addq    $4096, %rsp
+; WIN_X64:     retq
+; LINUX-LABEL:main4k:
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with frame pointer
+define i32 @main4k_frame() nounwind "no-frame-pointer-elim"="true" {
+entry:
+; WIN_X64-LABEL:main4k_frame:
+; WIN_X64:      movq    %rcx,   16(%rsp)
+; WIN_X64:      movq    %gs:16, %rcx
+; LINUX-LABEL:main4k_frame:
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with INT args
+define i32 @main4k_intargs(i32 %x, i32 %y) nounwind {
+entry:
+; WIN_X64:      movq    %rcx,   8(%rsp)
+; WIN_X64:      movq    %gs:16, %rcx
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [4096 x i8]
+  %t = add i32 %x, %y
+  ret i32 %t
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with FP regs
+define i32 @main4k_fpargs(double %x, double %y) nounwind {
+entry:
+; WIN_X64:      movq    %rcx,   8(%rsp)
+; WIN_X64:      movq    %gs:16, %rcx
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+; Prolog stack allocation >= 4096 bytes will require the probe sequence
+; Case with mixed regs
+define i32 @main4k_mixargs(double %x, i32 %y) nounwind {
+entry:
+; WIN_X64:      movq    %gs:16, %rcx
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [4096 x i8]
+  ret i32 %y
+}
+
+; Make sure we don't emit the probe for a smaller prolog stack allocation.
+define i32 @main128() nounwind {
+entry:
+; WIN_X64-NOT:  movq    %gs:16, %rcx
+; WIN_X64:      retq
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [128 x i8]
+  ret i32 0
+}
+
+; Make sure we don't emit the probe sequence if not on windows even if the
+; caller has the Win64 calling convention.
+define x86_64_win64cc i32 @main4k_win64() nounwind {
+entry:
+; WIN_X64:      movq    %gs:16, %rcx
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
+
+declare i32 @bar(i8*) nounwind
+
+; Within-body inline probe expansion
+define x86_64_win64cc i32 @main4k_alloca(i64 %n) nounwind {
+entry:
+; WIN_X64:     callq   bar
+; WIN_X64:     movq    %gs:16, [[R:%r.*]]
+; WIN_X64:     callq   bar
+; LINUX:       callq   bar
+; LINUX-NOT:   movq    %gs:16, [[R:%r.*]]
+; LINUX:       callq   bar
+  %a = alloca i8, i64 1024
+  %ra = call i32 @bar(i8* %a) nounwind
+  %b = alloca i8, i64 %n
+  %rb = call i32 @bar(i8* %b) nounwind
+  %r = add i32 %ra, %rb
+  ret i32 %r
+}
+
+; Influence of stack-probe-size attribute
+; Note this is not exposed in coreclr
+define i32 @test_probe_size() "stack-probe-size"="8192" nounwind {
+; WIN_X64-NOT:  movq    %gs:16, %rcx
+; WIN_X64:     retq
+; LINUX-NOT:    movq    %gs:16, %rcx
+; LINUX:       retq
+  %a = alloca [4096 x i8]
+  ret i32 0
+}
author	Andy Ayers <andya@microsoft.com>
	Tue, 10 Nov 2015 01:50:49 +0000 (01:50 +0000)
committer	Andy Ayers <andya@microsoft.com>
	Tue, 10 Nov 2015 01:50:49 +0000 (01:50 +0000)
include/llvm/Target/TargetFrameLowering.h		patch \| blob \| history
lib/CodeGen/PrologEpilogInserter.cpp		patch \| blob \| history
lib/Target/X86/X86FrameLowering.cpp		patch \| blob \| history
lib/Target/X86/X86FrameLowering.h		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/stack-probe-size.ll		patch \| blob \| history
test/CodeGen/X86/win_coreclr_chkstk.ll	[new file with mode: 0644]	patch \| blob