From: Evan Cheng Date: Tue, 22 Dec 2009 17:47:23 +0000 (+0000) Subject: Remove target attribute break-sse-dep. Instead, do not fold load into sse partial... X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=b1f49813334278094b1ecd7ad920f5c276f7b3e6;p=oota-llvm.git Remove target attribute break-sse-dep. Instead, do not fold load into sse partial update instructions unless optimizing for size. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@91910 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 313177eac08..a6e1ca3128e 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -57,8 +57,6 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", "Support 64-bit instructions">; def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; -def FeatureBreakSSEDep : SubtargetFeature<"break-sse-dep", "BreakSSEDep","true", - "Should break SSE partial update dep with load / xorps">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions">; @@ -88,27 +86,17 @@ def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; def : Proc<"pentium3", [FeatureSSE1]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; -def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"corei7", [FeatureSSE42, 
Feature64Bit, FeatureSlowBTMem, - FeatureBreakSSEDep]>; -def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, - FeatureBreakSSEDep]>; +def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; +def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; +def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>; // Sandy Bridge does not have FMA -def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit, - FeatureBreakSSEDep]>; +def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index e1e6ff3b1a9..b0e6cf7d3a2 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2370,7 +2370,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Check switch flag if (NoFusing) return NULL; - if (TM.getSubtarget().shouldBreakSSEDep()) + if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize)) switch (MI->getOpcode()) { case X86::CVTSD2SSrr: case X86::Int_CVTSD2SSrr: @@ -2422,7 +2422,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Check switch flag if (NoFusing) return NULL; - if (TM.getSubtarget().shouldBreakSSEDep()) + if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize)) switch (MI->getOpcode()) { case X86::CVTSD2SSrr: case X86::Int_CVTSD2SSrr: diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 0b6efaa755c..3f09cb00958 100644 --- 
a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -298,11 +298,10 @@ def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" "TM.getCodeModel() == CodeModel::Kernel">; def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def OptForSize : Predicate<"OptForSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; -def SSEBreakDep : Predicate<"Subtarget->shouldBreakSSEDep() && !OptForSize">; -def NoSSEBreakDep: Predicate<"!Subtarget->shouldBreakSSEDep() || OptForSize">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 694b91ea429..b26e5086920 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -827,7 +827,7 @@ multiclass sse1_fp_unop_rm opc, string OpcodeStr, def SSm : I, XS, - Requires<[HasSSE1, NoSSEBreakDep]>; + Requires<[HasSSE1, OptForSize]>; // Vector operation, reg. def PSr : PSI, XD, - Requires<[HasSSE2, NoSSEBreakDep]>; + Requires<[HasSSE2, OptForSize]>; def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src), "cvtsi2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (sint_to_fp GR32:$src))]>; @@ -1157,10 +1157,10 @@ def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, - Requires<[HasSSE2, NoSSEBreakDep]>; + Requires<[HasSSE2, OptForSize]>; def : Pat<(extloadf32 addr:$src), - (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[SSEBreakDep]>; + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>; // Match intrinsics which expect XMM operand(s). 
def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), @@ -3232,7 +3232,7 @@ multiclass sse41_fp_unop_rm opcps, bits<8> opcpd, [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>, TA, OpSize, - Requires<[HasSSE41, NoSSEBreakDep]>; + Requires<[HasSSE41]>; // Vector intrinsic operation, reg def PDr_Int : SS4AIi8= 13); - BreakSSEDep = IsIntel; GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); HasX86_64 = (EDX >> 29) & 0x1; @@ -287,7 +286,6 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, , HasFMA3(false) , HasFMA4(false) , IsBTMemSlow(false) - , BreakSSEDep(false) , DarwinVers(0) , stackAlignment(8) // FIXME: this is a known good value for Yonah. How about others? diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index d9040aa3098..ef6dbafac34 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -78,14 +78,6 @@ protected: /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; - /// BreakSSEDep - True if codegen should unfold load or insert xorps / pxor - /// to break register dependency for a partial register update SSE - /// instruction. This is needed for instructions such as CVTSS2SD which - /// only update the lower part of the register, and the result of the updated - /// part does not depend on the contents of the destination before the - /// instruction, and the non-updated portion of the register is not used. - bool BreakSSEDep; - /// DarwinVers - Nonzero if this is a darwin platform: the numeric /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc. unsigned char DarwinVers; // Is any darwin-x86 platform. 
@@ -150,7 +142,6 @@ public: bool hasFMA3() const { return HasFMA3; } bool hasFMA4() const { return HasFMA4; } bool isBTMemSlow() const { return IsBTMemSlow; } - bool shouldBreakSSEDep() const { return BreakSSEDep; } bool isTargetDarwin() const { return TargetType == isDarwin; } bool isTargetELF() const { return TargetType == isELF; } diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll index 00c943f78db..acc0647bc87 100644 --- a/test/CodeGen/X86/break-sse-dep.ll +++ b/test/CodeGen/X86/break-sse-dep.ll @@ -1,27 +1,20 @@ -; RUN: llc < %s -march=x86-64 -mattr=+sse2,+break-sse-dep | FileCheck %s --check-prefix=YES -; RUN: llc < %s -march=x86-64 -mattr=+sse2,-break-sse-dep | FileCheck %s --check-prefix=NO +; RUN: llc < %s -march=x86-64 -mattr=+sse2 | FileCheck %s define double @t1(float* nocapture %x) nounwind readonly ssp { entry: -; YES: t1: -; YES: movss (%rdi), %xmm0 -; YES; cvtss2sd %xmm0, %xmm0 +; CHECK: t1: +; CHECK: movss (%rdi), %xmm0 +; CHECK: cvtss2sd %xmm0, %xmm0 -; NO: t1: -; NO; cvtss2sd (%rdi), %xmm0 %0 = load float* %x, align 4 %1 = fpext float %0 to double ret double %1 } -define float @t2(double* nocapture %x) nounwind readonly ssp { +define float @t2(double* nocapture %x) nounwind readonly ssp optsize { entry: -; YES: t2: -; YES: movsd (%rdi), %xmm0 -; YES; cvtsd2ss %xmm0, %xmm0 - -; NO: t2: -; NO; cvtsd2ss (%rdi), %xmm0 +; CHECK: t2: +; CHECK: cvtsd2ss (%rdi), %xmm0 %0 = load double* %x, align 8 %1 = fptrunc double %0 to float ret float %1