Significantly simplify and improve handling of FP function results on x86-32.

author Chris Lattner <sabre@nondot.org>

Thu, 24 Jan 2008 08:07:48 +0000 (08:07 +0000)

committer Chris Lattner <sabre@nondot.org>

Thu, 24 Jan 2008 08:07:48 +0000 (08:07 +0000)
author Chris Lattner <sabre@nondot.org>
Thu, 24 Jan 2008 08:07:48 +0000 (08:07 +0000)
committer Chris Lattner <sabre@nondot.org>
Thu, 24 Jan 2008 08:07:48 +0000 (08:07 +0000)
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp

index 9887bcccef41b52ff0435ed1871ba4d20f195adb..53cd6121d95490460c6a9f360dfa411b52c29d19 100644 (file)
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -156,7 +156,8 @@ namespace {
      bool TryFoldLoad(SDOperand P, SDOperand N,
                       SDOperand &Base, SDOperand &Scale,
                       SDOperand &Index, SDOperand &Disp);
-    void InstructionSelectPreprocess(SelectionDAG &DAG);
+    void PreprocessForRMW(SelectionDAG &DAG);
+    void PreprocessForFPConvert(SelectionDAG &DAG);
  
      /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
      /// inline asm expressions.
@@ -350,9 +351,10 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
                           Store.getOperand(2), Store.getOperand(3));
  }
  
-/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction
-/// selector to pick more load-modify-store instructions. This is a common
-/// case:
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// This is only run if not in -fast mode (aka -O0).
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
  ///
  ///     [Load chain]
  ///         ^
@@ -389,7 +391,7 @@ static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
  ///       \      /
  ///        \    /
  ///       [Store]
-void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
+void X86DAGToDAGISel::PreprocessForRMW(SelectionDAG &DAG) {
    for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
           E = DAG.allnodes_end(); I != E; ++I) {
      if (!ISD::isNON_TRUNCStore(I))
@@ -459,6 +461,66 @@ void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
    }
  }
  
+
+/// PreprocessForFPConvert - Walk over the dag lowering fpround and fpextend
+/// nodes that target the FP stack to be store and load to the stack.  This is a
+/// gross hack.  We would like to simply mark these as being illegal, but when
+/// we do that, legalize produces these when it expands calls, then expands
+/// these in the same legalize pass.  We would like dag combine to be able to
+/// hack on these between the call expansion and the node legalization.  As such
+/// this pass basically does "really late" legalization of these inline with the
+/// X86 isel pass.
+void X86DAGToDAGISel::PreprocessForFPConvert(SelectionDAG &DAG) {
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+      continue;
+    
+    // If the source and destination are SSE registers, then this is a legal
+    // conversion that should not be lowered.
+    MVT::ValueType SrcVT = N->getOperand(0).getValueType();
+    MVT::ValueType DstVT = N->getValueType(0);
+    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+    if (SrcIsSSE && DstIsSSE)
+      continue;
+
+    // If this is an FPStack extension (but not a truncation), it is a noop.
+    if (!SrcIsSSE && !DstIsSSE && N->getOpcode() == ISD::FP_EXTEND)
+      continue;
+    
+    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+    // FPStack has extload and truncstore.  SSE can fold direct loads into other
+    // operations.  Based on this, decide what we want to do.
+    MVT::ValueType MemVT;
+    if (N->getOpcode() == ISD::FP_ROUND)
+      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+    else
+      MemVT = SrcIsSSE ? SrcVT : DstVT;
+    
+    SDOperand MemTmp = DAG.CreateStackTemporary(MemVT);
+    
+    // FIXME: optimize the case where the src/dest is a load or store?
+    SDOperand Store = DAG.getTruncStore(DAG.getEntryNode(), N->getOperand(0),
+                                        MemTmp, NULL, 0, MemVT);
+    SDOperand Result = DAG.getExtLoad(ISD::EXTLOAD, DstVT, Store, MemTmp,
+                                      NULL, 0, MemVT);
+
+    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created.  This will cause general havok on the dag because
+    // anything below the conversion could be folded into other existing nodes.
+    // To avoid invalidating 'I', back it up to the convert node.
+    --I;
+    DAG.ReplaceAllUsesOfValueWith(SDOperand(N, 0), Result);
+    
+    // Now that we did that, the node is dead.  Increment the iterator to the
+    // next node to process, then delete N.
+    ++I;
+    DAG.DeleteNode(N);
+  }  
+}
+
  /// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
  /// when it has created a SelectionDAG for us to codegen.
  void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
@@ -466,7 +528,10 @@ void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
    MachineFunction::iterator FirstMBB = BB;
  
    if (!FastISel)
-    InstructionSelectPreprocess(DAG);
+    PreprocessForRMW(DAG);
+
+  // FIXME: This should only happen when not -fast.
+  PreprocessForFPConvert(DAG);
  
    // Codegen the basic block.
  #ifndef NDEBUG
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 9140539e3828c238b29cfaa4a93d6b97308e52af..9ca99fe6dd0278670b0ce60af46a69bda6e12650 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -47,6 +47,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
    X86ScalarSSEf32 = Subtarget->hasSSE1();
    X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
    
+  bool Fast = false;
  
    RegInfo = TM.getRegisterInfo();
  
@@ -355,13 +356,15 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
      addLegalFPImmediate(APFloat(+0.0)); // xorpd
      addLegalFPImmediate(APFloat(+0.0f)); // xorps
  
-    // Conversions to long double (in X87) go through memory.
-    setConvertAction(MVT::f32, MVT::f80, Expand);
-    setConvertAction(MVT::f64, MVT::f80, Expand);
-
-    // Conversions from long double (in X87) go through memory.
-    setConvertAction(MVT::f80, MVT::f32, Expand);
-    setConvertAction(MVT::f80, MVT::f64, Expand);
+    // Floating truncations from f80 and extensions to f80 go through memory.
+    // If optimizing, we lie about this though and handle it in
+    // InstructionSelectPreprocess so that dagcombine2 can hack on these.
+    if (Fast) {
+      setConvertAction(MVT::f32, MVT::f80, Expand);
+      setConvertAction(MVT::f64, MVT::f80, Expand);
+      setConvertAction(MVT::f80, MVT::f32, Expand);
+      setConvertAction(MVT::f80, MVT::f64, Expand);
+    }
    } else if (X86ScalarSSEf32) {
      // Use SSE for f32, x87 for f64.
      // Set up the FP register classes.
@@ -395,15 +398,17 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
      addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
  
-    // SSE->x87 conversions go through memory.
-    setConvertAction(MVT::f32, MVT::f64, Expand);
-    setConvertAction(MVT::f32, MVT::f80, Expand);
-
-    // x87->SSE truncations need to go through memory.
-    setConvertAction(MVT::f80, MVT::f32, Expand);    
-    setConvertAction(MVT::f64, MVT::f32, Expand);
-    // And x87->x87 truncations also.
-    setConvertAction(MVT::f80, MVT::f64, Expand);
+    // SSE <-> X87 conversions go through memory.  If optimizing, we lie about
+    // this though and handle it in InstructionSelectPreprocess so that
+    // dagcombine2 can hack on these.
+    if (Fast) {
+      setConvertAction(MVT::f32, MVT::f64, Expand);
+      setConvertAction(MVT::f32, MVT::f80, Expand);
+      setConvertAction(MVT::f80, MVT::f32, Expand);    
+      setConvertAction(MVT::f64, MVT::f32, Expand);
+      // And x87->x87 truncations also.
+      setConvertAction(MVT::f80, MVT::f64, Expand);
+    }
  
      if (!UnsafeFPMath) {
        setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
@@ -420,10 +425,14 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
      setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  
-    // Floating truncations need to go through memory.
-    setConvertAction(MVT::f80, MVT::f32, Expand);    
-    setConvertAction(MVT::f64, MVT::f32, Expand);
-    setConvertAction(MVT::f80, MVT::f64, Expand);
+    // Floating truncations go through memory.  If optimizing, we lie about
+    // this though and handle it in InstructionSelectPreprocess so that
+    // dagcombine2 can hack on these.
+    if (Fast) {
+      setConvertAction(MVT::f80, MVT::f32, Expand);    
+      setConvertAction(MVT::f64, MVT::f32, Expand);
+      setConvertAction(MVT::f80, MVT::f64, Expand);
+    }
  
      if (!UnsafeFPMath) {
        setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
@@ -647,7 +656,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
      }
  
      setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-    
+
      // Custom lower v2i64 and v2f64 selects.
      setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
      setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
@@ -808,30 +817,10 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
      // a register.
      SDOperand Value = Op.getOperand(1);
      
-    // If this is an FP return with ScalarSSE, we need to move the value from
-    // an XMM register onto the fp-stack.
-    if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
-      SDOperand MemLoc;
-        
-      // If this is a load into a scalarsse value, don't store the loaded value
-      // back to the stack, only to reload it: just replace the scalar-sse load.
-      if (ISD::isNON_EXTLoad(Value.Val) &&
-           Chain.reachesChainWithoutSideEffects(Value.getOperand(0))) {
-        Chain  = Value.getOperand(0);
-        MemLoc = Value.getOperand(1);
-      } else {
-        // Spill the value to memory and reload it into top of stack.
-        unsigned Size = MVT::getSizeInBits(RVLocs[0].getValVT())/8;
-        MachineFunction &MF = DAG.getMachineFunction();
-        int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
-        MemLoc = DAG.getFrameIndex(SSFI, getPointerTy());
-        Chain = DAG.getStore(Op.getOperand(0), Value, MemLoc, NULL, 0);
-      }
-      SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other);
-      SDOperand Ops[] = {Chain, MemLoc, DAG.getValueType(RVLocs[0].getValVT())};
-      Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
-      Chain = Value.getValue(1);
-    }
+    // an XMM register onto the fp-stack.  Do this with an FP_EXTEND to f80.
+    // This will get legalized into a load/store if it can't get optimized away.
+    if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT()))
+      Value = DAG.getNode(ISD::FP_EXTEND, MVT::f80, Value);
      
      SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
      SDOperand Ops[] = { Chain, Value };
@@ -876,87 +865,26 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
      // Copies from the FP stack are special, as ST0 isn't a valid register
      // before the fp stackifier runs.
      
-    // Copy ST0 into an RFP register with FP_GET_RESULT.
-    SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other, MVT::Flag);
+    // Copy ST0 into an RFP register with FP_GET_RESULT.  If this will end up
+    // in an SSE register, copy it out as F80 and do a truncate, otherwise use
+    // the specified value type.
+    MVT::ValueType GetResultTy = RVLocs[0].getValVT();
+    if (isScalarFPTypeInSSEReg(GetResultTy))
+      GetResultTy = MVT::f80;
+    SDVTList Tys = DAG.getVTList(GetResultTy, MVT::Other, MVT::Flag);
+    
      SDOperand GROps[] = { Chain, InFlag };
      SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2);
      Chain  = RetVal.getValue(1);
      InFlag = RetVal.getValue(2);
-    
-    // If we are using ScalarSSE, store ST(0) to the stack and reload it into
-    // an XMM register.
-    if (isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
-      SDOperand StoreLoc;
-      const Value *SrcVal = 0;
-      int SrcValOffset = 0;
-      MVT::ValueType RetStoreVT = RVLocs[0].getValVT();
-      
-      // Determine where to store the value.  If the call result is directly
-      // used by a store, see if we can store directly into the location.  In
-      // this case, we'll end up producing a fst + movss[load] + movss[store] to
-      // the same location, and the two movss's will be nuked as dead.  This
-      // optimizes common things like "*D = atof(..)" to not need an
-      // intermediate stack slot.
-      if (SDOperand(TheCall, 0).hasOneUse() && 
-          SDOperand(TheCall, 1).hasOneUse()) {
-        // In addition to direct uses, we also support a FP_ROUND that uses the
-        // value, if it is directly stored somewhere.
-        SDNode *User = *TheCall->use_begin();
-        if (User->getOpcode() == ISD::FP_ROUND && User->hasOneUse())
-          User = *User->use_begin();
-        
-        // Ok, we have one use of the value and one use of the chain.  See if
-        // they are the same node: a store.
-        if (StoreSDNode *N = dyn_cast<StoreSDNode>(User)) {
-          // Verify that the value being stored is either the call or a
-          // truncation of the call.
-          SDNode *StoreVal = N->getValue().Val;
-          if (StoreVal == TheCall)
-            ; // ok.
-          else if (StoreVal->getOpcode() == ISD::FP_ROUND &&
-                   StoreVal->hasOneUse() && 
-                   StoreVal->getOperand(0).Val == TheCall)
-            ; // ok.
-          else
-            N = 0;  // not ok.
-            
-          if (N && N->getChain().Val == TheCall &&
-              !N->isVolatile() && !N->isTruncatingStore() && 
-              N->getAddressingMode() == ISD::UNINDEXED) {
-            StoreLoc = N->getBasePtr();
-            SrcVal = N->getSrcValue();
-            SrcValOffset = N->getSrcValueOffset();
-            RetStoreVT = N->getValue().getValueType();
-          }
-        }
-      }
  
-      // If we weren't able to optimize the result, just create a temporary
-      // stack slot.
-      if (StoreLoc.Val == 0) {
-        MachineFunction &MF = DAG.getMachineFunction();
-        int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
-        StoreLoc = DAG.getFrameIndex(SSFI, getPointerTy());
-      }
-      
-      // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
-      // shouldn't be necessary except that RFP cannot be live across
-      // multiple blocks (which could happen if a select gets lowered into
-      // multiple blocks and scheduled in between them). When stackifier is
-      // fixed, they can be uncoupled.
-      SDOperand Ops[] = {
-        Chain, RetVal, StoreLoc, DAG.getValueType(RetStoreVT), InFlag
-      };
-      Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
-      RetVal = DAG.getLoad(RetStoreVT, Chain,
-                           StoreLoc, SrcVal, SrcValOffset);
-      Chain = RetVal.getValue(1);
-      
-      // If we optimized a truncate, then extend the result back to its desired
-      // type.
-      if (RVLocs[0].getValVT() != RetStoreVT)
-        RetVal = DAG.getNode(ISD::FP_EXTEND, RVLocs[0].getValVT(), RetVal);
-    }
+    // If we want the result in an SSE register, use an FP_TRUNCATE to get it
+    // there.
+    if (GetResultTy != RVLocs[0].getValVT())
+      RetVal = DAG.getNode(ISD::FP_ROUND, RVLocs[0].getValVT(), RetVal,
+                           // This truncation won't change the value.
+                           DAG.getIntPtrConstant(1));
+    
      ResultVals.push_back(RetVal);
    }
    
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index a73d5afc2fccd74e99403deb3a36bf1ecd52e884..30d088c4ad59d8e41edbfe3101df1af7b6ad9bc7 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2734,6 +2734,14 @@ def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
  def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
  def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
  
+// extload f32 -> f64.  This matches load+fextend because we have a hack in 
+// the isel (PreprocessForFPConvert) that can introduce loads after dag combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+let Predicates = [HasSSE2] in
+ def : Pat<(fextend (loadf32 addr:$src)),
+           (CVTSS2SDrm addr:$src)>;
+
  // Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
  // 16-bits matter.
  def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
diff --git a/test/CodeGen/X86/fp-stack-direct-ret.ll b/test/CodeGen/X86/fp-stack-direct-ret.ll

new file mode 100644 (file)

index 0000000..78be2a3
--- /dev/null
+++ b/test/CodeGen/X86/fp-stack-direct-ret.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-as < %s | llc -march=x86 | not grep fstp
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | not grep movsd
+
+declare double @foo()
+
+define double @bar() {
+entry:
+       %tmp5 = tail call double @foo()
+       ret double %tmp5
+}
+
diff --git a/test/CodeGen/X86/fp-stack-ret-conv.ll b/test/CodeGen/X86/fp-stack-ret-conv.ll

new file mode 100644 (file)

index 0000000..5254e1c
--- /dev/null
+++ b/test/CodeGen/X86/fp-stack-ret-conv.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -mcpu=yonah | grep cvtss2sd
+; RUN: llvm-as < %s | llc -mcpu=yonah | grep fstps
+; RUN: llvm-as < %s | llc -mcpu=yonah | not grep cvtsd2ss
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "i686-apple-darwin8"
+
+define void @test(double *%b) {
+entry:
+       %tmp13 = tail call double @foo()
+       %tmp1314 = fptrunc double %tmp13 to float               ; <float> [#uses=1]
+       %tmp3940 = fpext float %tmp1314 to double               ; <double> [#uses=1]
+       volatile store double %tmp3940, double* %b
+       ret void
+}
+
+declare double @foo()
diff --git a/test/CodeGen/X86/pr1505b.ll b/test/CodeGen/X86/pr1505b.ll

index d5e3507583a50a0296034dc9ec304d3b45d59bf9..73cb23e7ed46fcbc6d6b57bc7f3fd74271fe923e 100644 (file)
--- a/test/CodeGen/X86/pr1505b.ll
+++ b/test/CodeGen/X86/pr1505b.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstpl | count 3
+; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstpl | count 4
  ; RUN: llvm-as < %s | llc -mcpu=i486 | grep fstps | count 3
  ; PR1505
author	Chris Lattner <sabre@nondot.org>
	Thu, 24 Jan 2008 08:07:48 +0000 (08:07 +0000)
committer	Chris Lattner <sabre@nondot.org>
	Thu, 24 Jan 2008 08:07:48 +0000 (08:07 +0000)
lib/Target/X86/X86ISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
test/CodeGen/X86/fp-stack-direct-ret.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/fp-stack-ret-conv.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/pr1505b.ll		patch \| blob \| history