From 1031549bec52ab3c5dd9f4dd7703e21bb5253bff Mon Sep 17 00:00:00 2001
From: Robert Lougher <rob.lougher@gmail.com>
Date: Thu, 29 Jan 2015 16:18:29 +0000
Subject: [PATCH] [X86] Use single add/sub for large stack offsets

For large stack offsets the compiler generates multiple immediate mode
sub/add instructions in the prologue/epilogue.  This patch makes the
compiler place the final amount to be added/subtracted into a register,
which is then added/substracted with a single operation.

Differential Revision: http://reviews.llvm.org/D7226


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227458 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp   | 59 +++++++++++++++++++++------
 test/CodeGen/X86/huge-stack-offset.ll | 59 +++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 13 deletions(-)
 create mode 100644 test/CodeGen/X86/huge-stack-offset.ll

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 16aab16d63e..bb2007dd682 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -82,6 +82,14 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
   }
 }
 
+static unsigned getSUBrrOpcode(unsigned isLP64) {
+  return isLP64 ? X86::SUB64rr : X86::SUB32rr;
+}
+
+static unsigned getADDrrOpcode(unsigned isLP64) {
+  return isLP64 ? X86::ADD64rr : X86::ADD32rr;
+}
+
 static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
   if (IsLP64) {
     if (isInt<8>(Imm))
@@ -165,6 +173,18 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
   return 0;
 }
 
+static bool isEAXLiveIn(MachineFunction &MF) {
+  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
+       EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
+    unsigned Reg = II->first;
+
+    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
+        Reg == X86::AH || Reg == X86::AL)
+      return true;
+  }
+
+  return false;
+}
 
 /// emitSPUpdate - Emit a series of instructions to increment / decrement the
 /// stack pointer by a constant value.
@@ -187,6 +207,32 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
   while (Offset) {
+    if (Offset > Chunk) {
+      // Rather than emit a long series of instructions for large offsets,
+      // load the offset into a register and do one sub/add
+      unsigned Reg = 0;
+
+      if (isSub && !isEAXLiveIn(*MBB.getParent()))
+        Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX);
+      else
+        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
+
+      if (Reg) {
+        Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri;
+        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
+          .addImm(Offset);
+        Opc = isSub
+          ? getSUBrrOpcode(Is64BitTarget)
+          : getADDrrOpcode(Is64BitTarget);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+          .addReg(StackPtr)
+          .addReg(Reg);
+        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+        Offset = 0;
+        continue;
+      }
+    }
+
     uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
     if (ThisVal == (Is64BitTarget ? 8 : 4)) {
       // Use push / pop instead.
@@ -316,19 +362,6 @@ static int mergeSPUpdates(MachineBasicBlock &MBB,
   return Offset;
 }
 
-static bool isEAXLiveIn(MachineFunction &MF) {
-  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
-       EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
-    unsigned Reg = II->first;
-
-    if (Reg == X86::EAX || Reg == X86::AX ||
-        Reg == X86::AH || Reg == X86::AL)
-      return true;
-  }
-
-  return false;
-}
-
 void
 X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator MBBI,
diff --git a/test/CodeGen/X86/huge-stack-offset.ll b/test/CodeGen/X86/huge-stack-offset.ll
new file mode 100644
index 00000000000..619516101a8
--- /dev/null
+++ b/test/CodeGen/X86/huge-stack-offset.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mtriple=x86_64-linux-unknown | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=i386-linux-unknown | FileCheck %s --check-prefix=CHECK-32
+
+; Test that a large stack offset uses a single add/sub instruction to
+; adjust the stack pointer.
+
+define void @foo() nounwind {
+; CHECK-64-LABEL: foo:
+; CHECK-64:      movabsq $50000000{{..}}, %rax
+; CHECK-64-NEXT: subq    %rax, %rsp
+; CHECK-64-NOT:  subq    $2147483647, %rsp
+; CHECK-64:      movabsq $50000000{{..}}, [[RAX:%r..]]
+; CHECK-64-NEXT: addq    [[RAX]], %rsp
+
+; CHECK-32-LABEL: foo:
+; CHECK-32:      movl    $50000000{{..}}, %eax
+; CHECK-32-NEXT: subl    %eax, %esp
+; CHECK-32-NOT:  subl    $2147483647, %esp
+; CHECK-32:      movl    $50000000{{..}}, [[EAX:%e..]]
+; CHECK-32-NEXT: addl    [[EAX]], %esp
+  %1 = alloca [5000000000 x i8], align 16
+  %2 = getelementptr inbounds [5000000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret void
+}
+
+; Verify that we do not clobber the return value.
+
+define i32 @foo2() nounwind {
+; CHECK-64-LABEL: foo2:
+; CHECK-64:     movl    $10, %eax
+; CHECK-64-NOT: movabsq ${{.*}}, %rax
+
+; CHECK-32-LABEL: foo2:
+; CHECK-32:     movl    $10, %eax
+; CHECK-32-NOT: movl    ${{.*}}, %eax
+  %1 = alloca [5000000000 x i8], align 16
+  %2 = getelementptr inbounds [5000000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 10
+}
+
+; Verify that we do not clobber EAX when using inreg attribute
+
+define i32 @foo3(i32 inreg %x) nounwind {
+; CHECK-64-LABEL: foo3:
+; CHECK-64:      movabsq $50000000{{..}}, %rax
+; CHECK-64-NEXT: subq    %rax, %rsp
+
+; CHECK-32-LABEL: foo3:
+; CHECK-32:      subl $2147483647, %esp
+; CHECK-32-NOT:  movl ${{.*}}, %eax
+  %1 = alloca [5000000000 x i8], align 16
+  %2 = getelementptr inbounds [5000000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+
+declare void @bar(i8*)
-- 
2.34.1