From: Michael Kuperstein <michael.m.kuperstein@intel.com>
Date: Tue, 9 Dec 2014 06:10:44 +0000 (+0000)
Subject: [X86] Convert esp-relative movs of function arguments into pushes, step 1
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=77c1b73211b4277820d87275d102933e4cd7ee10;p=oota-llvm.git

[X86] Convert esp-relative movs of function arguments into pushes, step 1

This handles the simplest case for mov -> push conversion:
1. x86-32 calling convention, everything is passed through the stack.
2. There is no reserved call frame.
3. Only registers or immediates are pushed, no attempt to combine a mem-reg-mem sequence into a single PUSHmm.

Differential Revision: http://reviews.llvm.org/D6503

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@223757 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index e7f9c11dbcf..a3a3dce87fa 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -93,6 +93,15 @@ static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
   return X86::AND32ri;
 }
 
+static unsigned getPUSHiOpcode(bool IsLP64, int64_t Imm) {
+  // We don't support LP64 for now.
+  assert(!IsLP64);
+
+  if (isInt<8>(Imm))
+    return X86::PUSH32i8;
+  return X86::PUSHi32;
+}
+
 static unsigned getLEArOpcode(unsigned IsLP64) {
   return IsLP64 ? X86::LEA64r : X86::LEA32r;
 }
@@ -1802,6 +1811,103 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
 #endif
 }
 
+bool X86FrameLowering::
+convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, uint64_t Amount) const {
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+    MF.getSubtarget().getRegisterInfo());
+  unsigned StackPtr = RegInfo.getStackRegister();
+
+  // Scan the call setup sequence for the pattern we're looking for.
+  // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
+  // instructions, that push a sequence of 32-bit values onto the stack, with
+  // no gaps.  
+  std::map<int64_t, MachineBasicBlock::iterator> MovMap;
+  do {
+    int Opcode = I->getOpcode();
+    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+      break;
+ 
+    // We only want movs of the form:
+    // movl imm/r32, k(%ecx)
+    // If we run into something else, bail
+    // Note that AddrBaseReg may, counterintuitively, not be a register...
+    if (!I->getOperand(X86::AddrBaseReg).isReg() || 
+        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+        !I->getOperand(X86::AddrDisp).isImm())
+      return false;
+
+    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+    
+    // We don't want to consider the unaligned case.
+    if (StackDisp % 4)
+      return false;
+
+    // If the same stack slot is being filled twice, something's fishy.
+    if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second)
+      return false;
+
+    ++I;
+  } while (I != MBB.end());
+
+  // We now expect the end of the sequence - a call and a stack adjust.
+  if (I == MBB.end())
+    return false;
+  if (!I->isCall())
+    return false;
+  MachineBasicBlock::iterator Call = I;
+  if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
+    return false;
+
+  // Now, go through the map, and see that we don't have any gaps,
+  // but only a series of 32-bit MOVs.
+  // Since std::map provides ordered iteration, the original order
+  // of the MOVs doesn't matter.
+  int64_t ExpectedDist = 0;
+  for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; 
+       ++MMI, ExpectedDist += 4)
+    if (MMI->first != ExpectedDist)
+      return false;
+
+  // Ok, everything looks fine. Do the transformation.
+  DebugLoc DL = I->getDebugLoc();
+
+  // It's possible the original stack adjustment amount was larger than
+  // that done by the pushes. If so, we still need a SUB.
+  Amount -= ExpectedDist;
+  if (Amount) {
+    MachineInstr* Sub = BuildMI(MBB, Call, DL,
+                          TII.get(getSUBriOpcode(false, Amount)), StackPtr)
+                  .addReg(StackPtr).addImm(Amount);
+    Sub->getOperand(3).setIsDead();
+  }
+
+  // Now, iterate through the map in reverse order, and replace the movs
+  // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
+  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
+    MachineBasicBlock::iterator MOV = MMI->second;
+    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    int PushOpcode;
+    if (MOV->getOpcode() == X86::MOV32mi) {
+      int64_t Val = PushOp.getImm();
+      BuildMI(MBB, Call, DL, TII.get(getPUSHiOpcode(false, Val)))
+        .addImm(Val);
+    } else {
+      PushOpcode = X86::PUSH32r;
+      BuildMI(MBB, Call, DL, TII.get(X86::PUSH32r))
+        .addReg(PushOp.getReg());
+    }
+    MBB.erase(MOV);
+  }
+
+  return true;
+}
+
 void X86FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -1809,21 +1915,20 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
   const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
                                        MF.getSubtarget().getRegisterInfo());
   unsigned StackPtr = RegInfo.getStackRegister();
-  bool reseveCallFrame = hasReservedCallFrame(MF);
+  bool reserveCallFrame = hasReservedCallFrame(MF);
   int Opcode = I->getOpcode();
   bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL = I->getDebugLoc();
-  uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
   uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
   I = MBB.erase(I);
 
-  if (!reseveCallFrame) {
+  if (!reserveCallFrame) {
     // If the stack pointer can be changed after prologue, turn the
     // adjcallstackup instruction into a 'sub ESP, <amt>' and the
     // adjcallstackdown instruction into 'add ESP, <amt>'
-    // TODO: consider using push / pop instead of sub + store / add
     if (Amount == 0)
       return;
 
@@ -1838,6 +1943,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
 
     MachineInstr *New = nullptr;
     if (Opcode == TII.getCallFrameSetupOpcode()) {
+      // Try to convert movs to the stack into pushes.
+      // We currently only look for a pattern that appears in 32-bit
+      // calling conventions.
+      if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
+        return;
+
       New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
                     StackPtr)
         .addReg(StackPtr)
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 2ee71159c19..ee0ee227cad 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -76,6 +76,16 @@ public:
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                  MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI) const override;
+
+private:
+  /// convertArgMovsToPushes - This method tries to convert a call sequence
+  /// that uses sub and mov instructions to put the argument onto the stack
+  /// into a series of pushes.
+  /// Returns true if the transformation succeeded, false if not.
+  bool convertArgMovsToPushes(MachineFunction &MF, 
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I, 
+                              uint64_t Amount) const;
 };
 
 } // End llvm namespace
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
index 95defc83db1..bd980694392 100644
--- a/test/CodeGen/X86/force-align-stack-alloca.ll
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -33,14 +33,14 @@ define i64 @g(i32 %i) nounwind {
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ;
 ; Next we set up the memset call, and then undo it.
-; CHECK:      subl   $32, %esp
+; CHECK:      subl   $20, %esp
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ; CHECK:      calll  memset
 ; CHECK-NEXT: addl   $32, %esp
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ;
 ; Next we set up the call to 'f'.
-; CHECK:      subl   $32, %esp
+; CHECK:      subl   $28, %esp
 ; CHECK-NOT:         {{[^ ,]*}}, %esp
 ; CHECK:      calll  f
 ; CHECK-NEXT: addl   $32, %esp
diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll
index 7cfa9291357..b1781d30f91 100644
--- a/test/CodeGen/X86/inalloca-ctor.ll
+++ b/test/CodeGen/X86/inalloca-ctor.ll
@@ -17,16 +17,16 @@ entry:
 ; CHECK: movl %esp,
   call void @Foo_ctor(%Foo* %c)
 ; CHECK: leal 12(%{{.*}}),
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
 ; CHECK: addl $4, %esp
   %b = getelementptr %frame* %args, i32 0, i32 1
   store i32 42, i32* %b
 ; CHECK: movl $42,
   %a = getelementptr %frame* %args, i32 0, i32 0
   call void @Foo_ctor(%Foo* %a)
-; CHECK: subl $4, %esp
-; CHECK: calll _Foo_ctor
+; CHECK-NEXT: pushl
+; CHECK-NEXT: calll _Foo_ctor
 ; CHECK: addl $4, %esp
   call void @f(%frame* inalloca %args)
 ; CHECK: calll   _f
diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll
index 6cff9ac0640..b56f24d9962 100644
--- a/test/CodeGen/X86/inalloca-invoke.ll
+++ b/test/CodeGen/X86/inalloca-invoke.ll
@@ -37,7 +37,7 @@ blah:
 invoke.cont:
   call void @begin(%Iter* sret %beg)
 
-; CHECK:  movl %[[beg]],
+; CHECK:  pushl %[[beg]]
 ; CHECK:  calll _begin
 
   invoke void @reverse(%frame.reverse* inalloca align 4 %rev_args)
diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll
index 54f97d99a9c..e5b07e262c7 100644
--- a/test/CodeGen/X86/inalloca-stdcall.ll
+++ b/test/CodeGen/X86/inalloca-stdcall.ll
@@ -19,7 +19,7 @@ define void @g() {
   call x86_stdcallcc void @f(%Foo* inalloca %b)
 ; CHECK: calll   _f@8
 ; CHECK-NOT: %esp
-; CHECK: subl $4, %esp
+; CHECK: pushl
 ; CHECK: calll   _i@4
   call x86_stdcallcc void @i(i32 0)
   ret void
diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll
index dd7f3964020..9a6de3dd1d9 100644
--- a/test/CodeGen/X86/mem-intrin-base-reg.ll
+++ b/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -63,7 +63,7 @@ spill_vectors:
 ; CHECK-LABEL: _memcpy_vla_vector:
 ; CHECK: andl $-16, %esp
 ; CHECK: movl %esp, %esi
-; CHECK: movl $128, {{.*}}(%esp)
+; CHECK: pushl $128
 ; CHECK: calll _memcpy
 ; CHECK: calll __chkstk
 
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
new file mode 100644
index 00000000000..6a848b7dc91
--- /dev/null
+++ b/test/CodeGen/X86/movtopush.ll
@@ -0,0 +1,97 @@
+; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED 
+declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
+declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
+
+; Here, we should have a reserved frame, so we don't expect pushes
+; NORMAL-LABEL: test1
+; NORMAL: subl    $16, %esp
+; NORMAL-NEXT: movl    $4, 12(%esp)
+; NORMAL-NEXT: movl    $3, 8(%esp)
+; NORMAL-NEXT: movl    $2, 4(%esp)
+; NORMAL-NEXT: movl    $1, (%esp)
+; NORMAL-NEXT: call
+define void @test1() {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Here, we expect a sequence of 4 immediate pushes
+; NORMAL-LABEL: test2
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+define void @test2(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Again, we expect a sequence of 4 immediate pushes
+; Checks that we generate the right pushes for >8bit immediates
+; NORMAL-LABEL: test2b
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4096
+; NORMAL-NEXT: pushl   $3072
+; NORMAL-NEXT: pushl   $2048
+; NORMAL-NEXT: pushl   $1024
+; NORMAL-NEXT: call
+define void @test2b(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
+  ret void
+}
+
+; The first push should push a register
+; NORMAL-LABEL: test3
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   %e{{..}}
+; NORMAL-NEXT: call
+define void @test3(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 %k, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; We don't support weird calling conventions
+; NORMAL-LABEL: test4
+; NORMAL: subl    $12, %esp
+; NORMAL-NEXT: movl    $4, 8(%esp)
+; NORMAL-NEXT: movl    $3, 4(%esp)
+; NORMAL-NEXT: movl    $1, (%esp)
+; NORMAL-NEXT: movl    $2, %eax
+; NORMAL-NEXT: call
+define void @test4(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @inreg(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Check that additional alignment is added when the pushes
+; don't add up to the required alignment.
+; ALIGNED-LABEL: test5
+; ALIGNED: subl    $16, %esp
+; ALIGNED-NEXT: pushl   $4
+; ALIGNED-NEXT: pushl   $3
+; ALIGNED-NEXT: pushl   $2
+; ALIGNED-NEXT: pushl   $1
+; ALIGNED-NEXT: call
+define void @test5(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+