From: David Majnemer Date: Tue, 5 Jan 2016 02:32:06 +0000 (+0000) Subject: Revert "[X86] Use push-pop for materializing small constants under 'minsize'" X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=7f4aa70ca5194c85f162a0d040e939b69460bb69;p=oota-llvm.git Revert "[X86] Use push-pop for materializing small constants under 'minsize'" The red zone consists of 128 bytes beyond the stack pointer so that the allocation of objects in leaf functions doesn't require decrementing rsp. In r255656, we introduced an optimization that would cheaply materialize certain constants via push/pop. Push decrements the stack pointer and stores it's result at what is now the top of the stack. However, this means that using push/pop would encroach on the red zone. PR26023 gives an example where this corrupts an object in the red zone. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256808 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 4414e478b99..868ae4e19e5 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -157,13 +157,9 @@ namespace { /// performance. bool OptForSize; - /// If true, selector should try to optimize for minimum code size. - bool OptForMinSize; - public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), OptForSize(false), - OptForMinSize(false) {} + : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} const char *getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -535,10 +531,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { - // OptFor[Min]Size are used in pattern predicates that isel is matching. + // OptForSize is used in pattern predicates that isel is matching. OptForSize = MF->getFunction()->optForSize(); - OptForMinSize = MF->getFunction()->optForMinSize(); - assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 5d7283f7bd5..a585775f84e 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -250,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1, AddedComplexity = 20 in + isPseudo = 1 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; @@ -263,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { } let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], - AddedComplexity = 15 in { + AddedComplexity = 1 in { // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, // which only require 3 bytes compared to MOV32ri which requires 5. let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { @@ -278,17 +278,6 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; } -let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in { -// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. -// FIXME: Add itinerary class and Schedule. -def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", - [(set GR32:$dst, i32immSExt8:$src)]>, - Requires<[OptForMinSize]>; -def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", - [(set GR64:$dst, i64immSExt8:$src)]>, - Requires<[OptForMinSize, NotWin64WithoutFP]>; -} - // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index bd4d39ec981..6799a5e01c2 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" @@ -5314,50 +5313,6 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, return true; } -bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { - MachineBasicBlock &MBB = *MIB->getParent(); - DebugLoc DL = MIB->getDebugLoc(); - int64_t Imm = MIB->getOperand(1).getImm(); - assert(Imm != 0 && "Using push/pop for 0 is not efficient."); - MachineBasicBlock::iterator I = MIB.getInstr(); - - int StackAdjustment; - - if (Subtarget.is64Bit()) { - assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || - MIB->getOpcode() == X86::MOV32ImmSExti8); - // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and - // widen the register if necessary. - StackAdjustment = 8; - BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); - MIB->setDesc(get(X86::POP64r)); - MIB->getOperand(0) - .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); - } else { - assert(MIB->getOpcode() == X86::MOV32ImmSExti8); - StackAdjustment = 4; - BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); - MIB->setDesc(get(X86::POP32r)); - } - - // Build CFI if necessary. - MachineFunction &MF = *MBB.getParent(); - const X86FrameLowering *TFL = Subtarget.getFrameLowering(); - bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool NeedsDwarfCFI = - !IsWin64Prologue && - (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); - bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; - if (EmitCFI) { - TFL->BuildCFI(MBB, I, DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); - TFL->BuildCFI(MBB, std::next(I), DL, - MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); - } - - return true; -} - // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -5390,9 +5345,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); case X86::MOV32r_1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); - case X86::MOV32ImmSExti8: - case X86::MOV64ImmSExti8: - return ExpandMOVImmSExti8(MIB); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 9d40334206b..edd09d61759 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -23,7 +23,6 @@ #include "X86GenInstrInfo.inc" namespace llvm { - class MachineInstrBuilder; class X86RegisterInfo; class X86Subtarget; @@ -565,9 +564,6 @@ private: /// operand and follow operands form a reference to the stack frame. bool isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const; - - /// Expand the MOVImmSExti8 pseudo-instructions. - bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 06ac4ee074c..c21c0c679e1 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -822,8 +822,6 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; -def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" - "Subtarget->getFrameLowering()->hasFP(*MF)">; def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -837,7 +835,6 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; -def OptForMinSize : Predicate<"OptForMinSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; diff --git a/test/CodeGen/X86/materialize-one.ll b/test/CodeGen/X86/materialize-one.ll new file mode 100644 index 00000000000..49da8008b88 --- /dev/null +++ b/test/CodeGen/X86/materialize-one.ll @@ -0,0 +1,100 @@ +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64 + +define i32 @one32() optsize { +entry: + ret i32 1 + +; CHECK32-LABEL: one32 +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: ret + +; FIXME: Figure out the best approach in 64-bit mode. +; CHECK64-LABEL: one32 +; CHECK64: movl $1, %eax +; CHECK64-NEXT: retq +} + +define i32 @minus_one32() optsize { +entry: + ret i32 -1 + +; CHECK32-LABEL: minus_one32 +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: ret +} + +define i16 @one16() optsize { +entry: + ret i16 1 + +; CHECK32-LABEL: one16 +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: retl +} + +define i16 @minus_one16() optsize { +entry: + ret i16 -1 + +; CHECK32-LABEL: minus_one16 +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: retl +} + +define i32 @test_rematerialization() optsize { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; -1 should be re-materialized here instead of getting spilled above. + ret i32 -1 + +; CHECK32-LABEL: test_rematerialization +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NOT: %eax +; CHECK32: retl +} + +define i32 @test_rematerialization2(i32 %x) optsize { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; Define eflags. + %a = icmp ne i32 %x, 123 + %b = zext i1 %a to i32 + ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags. + ; It must therefore not use the xor-dec lowering. + %c = select i1 %a, i32 %b, i32 -1 + ret i32 %c + +; CHECK32-LABEL: test_rematerialization2 +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: cmpl +; CHECK32: setne +; CHECK32-NOT: xorl +; CHECK32: movl $-1 +; CHECK32: cmov +; CHECK32: retl +} + +declare x86_thiscallcc void @f(i32) diff --git a/test/CodeGen/X86/materialize.ll b/test/CodeGen/X86/materialize.ll deleted file mode 100644 index 695bf0fa5b9..00000000000 --- a/test/CodeGen/X86/materialize.ll +++ /dev/null @@ -1,184 +0,0 @@ -; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64 -; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64 - -define i32 @one32_nooptsize() { -entry: - ret i32 1 - -; When not optimizing for size, use mov. -; CHECK32-LABEL: one32_nooptsize: -; CHECK32: movl $1, %eax -; CHECK32-NEXT: retl -; CHECK64-LABEL: one32_nooptsize: -; CHECK64: movl $1, %eax -; CHECK64-NEXT: retq -} - -define i32 @one32() optsize { -entry: - ret i32 1 - -; CHECK32-LABEL: one32: -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: incl %eax -; CHECK32-NEXT: retl - -; FIXME: Figure out the best approach in 64-bit mode. -; CHECK64-LABEL: one32: -; CHECK64: movl $1, %eax -; CHECK64-NEXT: retq -} - -define i32 @one32_minsize() minsize { -entry: - ret i32 1 - -; On 32-bit, xor-inc is preferred over push-pop. -; CHECK32-LABEL: one32_minsize: -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: incl %eax -; CHECK32-NEXT: retl - -; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to -; pop into a 64-bit register even when we just need 32 bits. -; CHECK64-LABEL: one32_minsize: -; CHECK64: pushq $1 -; CHECK64: .cfi_adjust_cfa_offset 8 -; CHECK64: popq %rax -; CHECK64: .cfi_adjust_cfa_offset -8 -; CHECK64-NEXT: retq -} - -define i64 @one64_minsize() minsize { -entry: - ret i64 1 -; On 64-bit we don't do xor-inc yet, so push-pop it is. -; CHECK64-LABEL: one64_minsize: -; CHECK64: pushq $1 -; CHECK64: .cfi_adjust_cfa_offset 8 -; CHECK64: popq %rax -; CHECK64: .cfi_adjust_cfa_offset -8 -; CHECK64-NEXT: retq - -; On Win64 we can't adjust the stack unless there's a frame pointer. -; CHECKWIN64-LABEL: one64_minsize: -; CHECKWIN64: movl $1, %eax -; CHECKWIN64-NEXT: retq -} - -define i32 @minus_one32() optsize { -entry: - ret i32 -1 - -; CHECK32-LABEL: minus_one32: -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: decl %eax -; CHECK32-NEXT: retl -} - -define i32 @minus_one32_minsize() minsize { -entry: - ret i32 -1 - -; xor-dec is preferred over push-pop. -; CHECK32-LABEL: minus_one32_minsize: -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: decl %eax -; CHECK32-NEXT: retl -} - -define i16 @one16() optsize { -entry: - ret i16 1 - -; CHECK32-LABEL: one16: -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: incl %eax -; CHECK32-NEXT: retl -} - -define i16 @minus_one16() optsize { -entry: - ret i16 -1 - -; CHECK32-LABEL: minus_one16: -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: decl %eax -; CHECK32-NEXT: retl -} - -define i32 @minus_five32() minsize { -entry: - ret i32 -5 - -; CHECK32-LABEL: minus_five32: -; CHECK32: pushl $-5 -; CHECK32: popl %eax -; CHECK32: retl -} - -define i64 @minus_five64() minsize { -entry: - ret i64 -5 - -; CHECK64-LABEL: minus_five64: -; CHECK64: pushq $-5 -; CHECK64: .cfi_adjust_cfa_offset 8 -; CHECK64: popq %rax -; CHECK64: .cfi_adjust_cfa_offset -8 -; CHECK64: retq -} - -define i32 @rematerialize_minus_one() optsize { -entry: - ; Materialize -1 (thiscall forces it into %ecx). - tail call x86_thiscallcc void @f(i32 -1) - - ; Clobber all registers except %esp, leaving nowhere to store the -1 besides - ; spilling it to the stack. - tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() - - ; -1 should be re-materialized here instead of getting spilled above. - ret i32 -1 - -; CHECK32-LABEL: rematerialize_minus_one -; CHECK32: xorl %ecx, %ecx -; CHECK32-NEXT: decl %ecx -; CHECK32: calll -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: decl %eax -; CHECK32-NOT: %eax -; CHECK32: retl -} - -define i32 @rematerialize_minus_one_eflags(i32 %x) optsize { -entry: - ; Materialize -1 (thiscall forces it into %ecx). - tail call x86_thiscallcc void @f(i32 -1) - - ; Clobber all registers except %esp, leaving nowhere to store the -1 besides - ; spilling it to the stack. - tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() - - ; Define eflags. - %a = icmp ne i32 %x, 123 - %b = zext i1 %a to i32 - ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags. - ; It must therefore not use the xor-dec lowering. - %c = select i1 %a, i32 %b, i32 -1 - ret i32 %c - -; CHECK32-LABEL: rematerialize_minus_one_eflags -; CHECK32: xorl %ecx, %ecx -; CHECK32-NEXT: decl %ecx -; CHECK32: calll -; CHECK32: cmpl -; CHECK32: setne -; CHECK32-NOT: xorl -; CHECK32: movl $-1 -; CHECK32: cmov -; CHECK32: retl -} - -declare x86_thiscallcc void @f(i32) diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll index 17d3e3e7d33..88b5f4eb14b 100644 --- a/test/CodeGen/X86/powi.ll +++ b/test/CodeGen/X86/powi.ll @@ -29,9 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize { define double @pow_wrapper_minsize(double %a) minsize { ; CHECK-LABEL: pow_wrapper_minsize: ; CHECK: # BB#0: -; CHECK-NEXT: movl $128, %edi +; CHECK-NEXT: movl $15, %edi ; CHECK-NEXT: jmp - %ret = tail call double @llvm.powi.f64(double %a, i32 128) nounwind ; [#uses=1] + %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1] ret double %ret }