BUILD_VECTOR was missing out on some prime opportunities to use SSE 4.1 inserts.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index abc5d9bc1d5b8e7a7a72a75ea56fad4f122cc96b..960655806b7e325a5890fb5f9bc7864c8d02ee08 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16,7 +16,6 @@
  #include "X86.h"
  #include "X86InstrBuilder.h"
  #include "X86ISelLowering.h"
-#include "X86MCTargetExpr.h"
  #include "X86TargetMachine.h"
  #include "X86TargetObjectFile.h"
  #include "llvm/CallingConv.h"
@@ -37,7 +36,7 @@
  #include "llvm/CodeGen/PseudoSourceValue.h"
  #include "llvm/MC/MCAsmInfo.h"
  #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCSymbol.h"
  #include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/SmallSet.h"
@@ -70,54 +69,10 @@ Disable16Bit("disable-16bit", cl::Hidden,
  static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                         SDValue V2);
  
-// FIXME: This is for a test.
-static cl::opt<bool>
-EnableX86EHTest("enable-x86-eh-test", cl::Hidden);
-
-namespace llvm {
-  class X86_test_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
-  public:
-    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM) {
-      TargetLoweringObjectFileMachO::Initialize(Ctx, TM);
-
-      // Exception Handling.
-      LSDASection = getMachOSection("__TEXT", "__gcc_except_tab", 0,
-                                    SectionKind::getReadOnlyWithRel());
-    }
-
-    virtual unsigned getTTypeEncoding() const {
-      return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-    }
-  };
-
-  class X8664_test_MachoTargetObjectFile : public X8664_MachoTargetObjectFile {
-  public:
-    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM) {
-      TargetLoweringObjectFileMachO::Initialize(Ctx, TM);
-
-      // Exception Handling.
-      LSDASection = getMachOSection("__TEXT", "__gcc_except_tab", 0,
-                                    SectionKind::getReadOnlyWithRel());
-    }
-
-    virtual unsigned getTTypeEncoding() const {
-      return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-    }
-  };
-}
-
  static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
    switch (TM.getSubtarget<X86Subtarget>().TargetType) {
    default: llvm_unreachable("unknown subtarget type");
    case X86Subtarget::isDarwin:
-    // FIXME: This is for an EH test.
-    if (EnableX86EHTest) {
-      if (TM.getSubtarget<X86Subtarget>().is64Bit())
-        return new X8664_test_MachoTargetObjectFile();
-      else
-        return new X86_test_MachoTargetObjectFile();
-    }
-
      if (TM.getSubtarget<X86Subtarget>().is64Bit())
        return new X8664_MachoTargetObjectFile();
      return new TargetLoweringObjectFileMachO();
@@ -1035,6 +990,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    // We have target-specific dag combine patterns for the following nodes:
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::SELECT);
    setTargetDAGCombine(ISD::SHL);
@@ -1165,8 +1121,8 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
           Subtarget->isPICStyleGOT());
    // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
    // entries.
-  return X86MCTargetExpr::Create(MBB->getSymbol(Ctx),
-                                 X86MCTargetExpr::GOTOFF, Ctx);
+  return MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
  }
  
  /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
@@ -1524,7 +1480,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                          DebugLoc dl,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &InVals) {
-
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  
@@ -1826,7 +1781,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
  
    if (isTailCall) {
      // Check if it's really possible to do a tail call.
-    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
                                                     Outs, Ins, DAG);
  
      // Sibcalls are automatically detected tailcalls which do not require
@@ -2133,18 +2089,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                                           OpFlags);
    }
  
-  if (isTailCall && !WasGlobalOrExternal) {
-    // Force the address into a (call preserved) caller-saved register since
-    // tailcall must happen after callee-saved registers are poped.
-    // FIXME: Give it a special register class that contains caller-saved
-    // register instead?
-    unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
-    Chain = DAG.getCopyToReg(Chain,  dl,
-                             DAG.getRegister(TCReg, getPointerTy()),
-                             Callee,InFlag);
-    Callee = DAG.getRegister(TCReg, getPointerTy());
-  }
-
    // Returns a chain & a flag for retval copy to use.
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
    SmallVector<SDValue, 8> Ops;
@@ -2190,14 +2134,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
          if (RVLocs[i].isRegLoc())
            MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
      }
-
-    assert(((Callee.getOpcode() == ISD::Register &&
-               (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
-                cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
-              Callee.getOpcode() == ISD::TargetExternalSymbol ||
-              Callee.getOpcode() == ISD::TargetGlobalAddress) &&
-           "Expecting a global address, external symbol, or scratch register");
-
      return DAG.getNode(X86ISD::TC_RETURN, dl,
                         NodeTys, &Ops[0], Ops.size());
    }
@@ -2344,6 +2280,8 @@ bool
  X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                       CallingConv::ID CalleeCC,
                                                       bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                                       SelectionDAG& DAG) const {
@@ -2363,10 +2301,37 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    // Look for obvious safe cases to perform tail call optimization that does not
    // requite ABI changes. This is what gcc calls sibcall.
  
-  // Do not tail call optimize vararg calls for now.
+  // Do not sibcall optimize vararg calls for now.
    if (isVarArg)
      return false;
  
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
+  // Therefore if it's not used by the call it is not safe to optimize this into
+  // a sibcall.
+  bool Unused = false;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    if (!Ins[i].Used) {
+      Unused = true;
+      break;
+    }
+  }
+  if (Unused) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CalleeCC, false, getTargetMachine(),
+                   RVLocs, *DAG.getContext());
+    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+    for (unsigned i = 0; i != RVLocs.size(); ++i) {
+      CCValAssign &VA = RVLocs[i];
+      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+        return false;
+    }
+  }
+
    // If the callee takes no arguments then go on to check the results of the
    // call.
    if (!Outs.empty()) {
@@ -3648,6 +3613,54 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
    return SDValue();
  }
  
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+                                        DebugLoc &dl, SelectionDAG &DAG) {
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = Elts.size();
+  
+  // FIXME: check for zeroes
+  LoadSDNode *LDBase = NULL;
+  unsigned LastLoadedElt = -1U;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Elts[i];
+    
+    if (!Elt.getNode() ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+      return SDValue();
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+        return SDValue();
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    LoadSDNode *LD = cast<LoadSDNode>(Elt);
+    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+      return SDValue();
+    LastLoadedElt = i;
+  }
+                                       
+  if (LastLoadedElt == NumElems - 1) {
+    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
+      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+                       LDBase->isVolatile(), LDBase->isNonTemporal(),
+                       LDBase->getAlignment());
+  } else if (NumElems == 4 && LastLoadedElt == 1) {
+    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
+  }
+  return SDValue();
+}
+
  SDValue
  X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    DebugLoc dl = Op.getDebugLoc();
@@ -3876,14 +3889,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
      return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
    }
  
-  if (Values.size() > 2) {
-    // If we have SSE 4.1, Expand into a number of inserts unless the number of
-    // values to be inserted is equal to the number of elements, in which case
-    // use the unpack code below in the hopes of matching the consecutive elts
-    // load merge pattern for shuffles.
-    // FIXME: We could probably just check that here directly.
-    if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
-        getSubtarget()->hasSSE41()) {
+  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+    // Gather the operands so they can be checked for consecutive loads.
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = Op.getOperand(i);
+    
+    // Check for elements which are consecutive loads.
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    if (LD.getNode())
+      return LD;
+    
+    // For SSE 4.1, use inserts into undef.  
+    if (getSubtarget()->hasSSE41()) {
        V[0] = DAG.getUNDEF(VT);
        for (unsigned i = 0; i < NumElems; ++i)
          if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
@@ -3891,7 +3908,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
                               Op.getOperand(i), DAG.getIntPtrConstant(i));
        return V[0];
      }
-    // Expand into a number of unpckl*.
+    
+    // Otherwise, expand into a number of unpckl*
      // e.g. for v4f32
      //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
      //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
@@ -3906,7 +3924,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
      }
      return V[0];
    }
-
    return SDValue();
  }
  
@@ -6217,7 +6234,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
            N2C && N2C->isNullValue() &&
            RHSC && RHSC->isNullValue()) {
          SDValue CmpOp0 = Cmp.getOperand(0);
-        Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(),
+        Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                            CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
          return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
                             DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
@@ -8498,6 +8515,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::CMOV_V4F32:
    case X86::CMOV_V2F64:
    case X86::CMOV_V2I64:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
      return EmitLoweredSelect(MI, BB, EM);
  
    case X86::FP32_TO_INT16_IN_MEM:
@@ -8827,82 +8849,104 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
    return TargetLowering::isGAPlusOffset(N, GA, Offset);
  }
  
-static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
-                                     EVT EltVT, LoadSDNode *&LDBase,
-                                     unsigned &LastLoadedElt,
-                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
-                                     const TargetLowering &TLI) {
-  LDBase = NULL;
-  LastLoadedElt = -1U;
-  for (unsigned i = 0; i < NumElems; ++i) {
-    if (N->getMaskElt(i) < 0) {
-      if (!LDBase)
-        return false;
-      continue;
-    }
-
-    SDValue Elt = DAG.getShuffleScalarElt(N, i);
-    if (!Elt.getNode() ||
-        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
-      return false;
-    if (!LDBase) {
-      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
-        return false;
-      LDBase = cast<LoadSDNode>(Elt.getNode());
-      LastLoadedElt = i;
-      continue;
-    }
-    if (Elt.getOpcode() == ISD::UNDEF)
-      continue;
-
-    LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
-      return false;
-    LastLoadedElt = i;
-  }
-  return true;
-}
-
  /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
  /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
  /// if the load addresses are consecutive, non-overlapping, and in the right
-/// order.  In the case of v2i64, it will see if it can rewrite the
-/// shuffle to be an appropriate build vector so it can take advantage of
-// performBuildVectorCombine.
+/// order.
  static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                       const TargetLowering &TLI) {
    DebugLoc dl = N->getDebugLoc();
    EVT VT = N->getValueType(0);
-  EVT EltVT = VT.getVectorElementType();
    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
-  unsigned NumElems = VT.getVectorNumElements();
  
    if (VT.getSizeInBits() != 128)
      return SDValue();
  
-  // Try to combine a vector_shuffle into a 128-bit load.
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  LoadSDNode *LD = NULL;
-  unsigned LastLoadedElt;
-  if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
-                                MFI, TLI))
+  SmallVector<SDValue, 16> Elts;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
+  
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts to a
+/// simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+                                                const TargetLowering &TLI) {
+  SDValue InputVector = N->getOperand(0);
+
+  // Only operate on vectors of 4 elements, where the alternative shuffling
+  // gets to be more expensive.
+  if (InputVector.getValueType() != MVT::v4i32)
      return SDValue();
  
-  if (LastLoadedElt == NumElems - 1) {
-    if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
-      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                         LD->getSrcValue(), LD->getSrcValueOffset(),
-                         LD->isVolatile(), LD->isNonTemporal(), 0);
-    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                       LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile(), LD->isNonTemporal(),
-                       LD->getAlignment());
-  } else if (NumElems == 4 && LastLoadedElt == 1) {
-    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
-    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
-    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
+  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+  // single use which is a sign-extend or zero-extend, and all elements are
+  // used.
+  SmallVector<SDNode *, 4> Uses;
+  unsigned ExtractedElements = 0;
+  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+    if (UI.getUse().getResNo() != InputVector.getResNo())
+      return SDValue();
+
+    SDNode *Extract = *UI;
+    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    if (Extract->getValueType(0) != MVT::i32)
+      return SDValue();
+    if (!Extract->hasOneUse())
+      return SDValue();
+    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+      return SDValue();
+
+    // Record which element was extracted.
+    ExtractedElements |=
+      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+    Uses.push_back(Extract);
+  }
+
+  // If not all the elements were used, this may not be worthwhile.
+  if (ExtractedElements != 15)
+    return SDValue();
+
+  // Ok, we've now decided to do the transformation.
+  DebugLoc dl = InputVector.getDebugLoc();
+
+  // Store the value to a temporary stack slot.
+  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0,
+                            false, false, 0);
+
+  // Replace each use (extract) with a load of the appropriate element.
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+       UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
+
+    // Compute the element's address.
+    SDValue Idx = Extract->getOperand(1);
+    unsigned EltSize =
+        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr);
+
+    // Load the scalar.
+    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr,
+                          NULL, 0, false, false, 0);
+
+    // Replace the extract with the load.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
    }
+
+  // The replacement was made in place; don't return anything.
    return SDValue();
  }
  
@@ -9795,6 +9839,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    switch (N->getOpcode()) {
    default: break;
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
+  case ISD::EXTRACT_VECTOR_ELT:
+                        return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
    case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
    case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
    case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);