Add LLVMScalarOpts to LLVMPowerPCCodeGen.

[oota-llvm.git] / lib / Target / PowerPC / PPCISelLowering.cpp
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp

index 4e95e964097e4343f82072fca98205ab57a5a4ef..351356028d63e5f456a63295382e0c817d8b98c7 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -39,6 +39,10 @@
  #include "llvm/Target/TargetOptions.h"
  using namespace llvm;
  
+// FIXME: Remove this once soft-float is supported.
+static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic",
+cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden);
+
  static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
  cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
  
@@ -51,19 +55,10 @@ cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
  // FIXME: Remove this once the bug has been fixed!
  extern cl::opt<bool> ANDIGlueBug;
  
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  // If it isn't a Mach-O file then it's going to be a linux ELF
-  // object file.
-  if (TT.isOSDarwin())
-    return new TargetLoweringObjectFileMachO();
-
-  return new PPC64LinuxTargetObjectFile();
-}
-
-PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
+    : TargetLowering(TM),
        Subtarget(*TM.getSubtargetImpl()) {
-  setPow2DivIsCheap();
+  setPow2SDivIsCheap();
  
    // Use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(true);
@@ -609,10 +604,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
      setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
    }
  
-  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+  if (!isPPC64) {
+    setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+  }
  
    setBooleanContents(ZeroOrOneBooleanContent);
    // Altivec instructions set fields to all zeros or all ones.
@@ -684,11 +679,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    if (Subtarget.isDarwin())
      setPrefFunctionAlignment(4);
  
-  if (isPPC64 && Subtarget.isJITCodeModel())
-    // Temporary workaround for the inability of PPC64 JIT to handle jump
-    // tables.
-    setSupportJumpTables(false);
-
    setInsertFencesForAtomic(true);
  
    if (Subtarget.enableMachineScheduler())
@@ -781,6 +771,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case PPCISD::SHL:             return "PPCISD::SHL";
    case PPCISD::CALL:            return "PPCISD::CALL";
    case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
+  case PPCISD::CALL_TLS:        return "PPCISD::CALL_TLS";
+  case PPCISD::CALL_NOP_TLS:    return "PPCISD::CALL_NOP_TLS";
    case PPCISD::MTCTR:           return "PPCISD::MTCTR";
    case PPCISD::BCTRL:           return "PPCISD::BCTRL";
    case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
@@ -810,10 +802,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case PPCISD::ADD_TLS:         return "PPCISD::ADD_TLS";
    case PPCISD::ADDIS_TLSGD_HA:  return "PPCISD::ADDIS_TLSGD_HA";
    case PPCISD::ADDI_TLSGD_L:    return "PPCISD::ADDI_TLSGD_L";
-  case PPCISD::GET_TLS_ADDR:    return "PPCISD::GET_TLS_ADDR";
    case PPCISD::ADDIS_TLSLD_HA:  return "PPCISD::ADDIS_TLSLD_HA";
    case PPCISD::ADDI_TLSLD_L:    return "PPCISD::ADDI_TLSLD_L";
-  case PPCISD::GET_TLSLD_ADDR:  return "PPCISD::GET_TLSLD_ADDR";
    case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
    case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
    case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
@@ -827,6 +817,11 @@ EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
    return VT.changeVectorElementTypeToInteger();
  }
  
+bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+  return true; // Same answer for every VT; VT is only checked by the assert.
+}
+
  //===----------------------------------------------------------------------===//
  // Node matching predicates, for use by the tblgen matching code.
  //===----------------------------------------------------------------------===//
@@ -858,20 +853,21 @@ static bool isConstantOrUndef(int Op, int Val) {
  /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
  bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                 SelectionDAG &DAG) {
+  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
    if (ShuffleKind == 0) {
-    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (IsLE)
        return false;
      for (unsigned i = 0; i != 16; ++i)
        if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
          return false;
    } else if (ShuffleKind == 2) {
-    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (!IsLE)
        return false;
      for (unsigned i = 0; i != 16; ++i)
        if (!isConstantOrUndef(N->getMaskElt(i), i*2))
          return false;
    } else if (ShuffleKind == 1) {
-    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
+    unsigned j = IsLE ? 0 : 1;
      for (unsigned i = 0; i != 8; ++i)
        if (!isConstantOrUndef(N->getMaskElt(i),    i*2+j) ||
            !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j))
@@ -888,22 +884,23 @@ bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
  /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
  bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                 SelectionDAG &DAG) {
+  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
    if (ShuffleKind == 0) {
-    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (IsLE)
        return false;
      for (unsigned i = 0; i != 16; i += 2)
        if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
            !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3))
          return false;
    } else if (ShuffleKind == 2) {
-    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+    if (!IsLE)
        return false;
      for (unsigned i = 0; i != 16; i += 2)
        if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2) ||
            !isConstantOrUndef(N->getMaskElt(i+1),  i*2+1))
          return false;
    } else if (ShuffleKind == 1) {
-    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 2;
+    unsigned j = IsLE ? 0 : 2;
      for (unsigned i = 0; i != 8; i += 2)
        if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j)   ||
            !isConstantOrUndef(N->getMaskElt(i+1),  i*2+j+1) ||
@@ -942,7 +939,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
  /// the input operands are swapped (see PPCInstrAltivec.td).
  bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                               unsigned ShuffleKind, SelectionDAG &DAG) {
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
      if (ShuffleKind == 1) // unary
        return isVMerge(N, UnitSize, 0, 0);
      else if (ShuffleKind == 2) // swapped
@@ -967,7 +964,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
  /// the input operands are swapped (see PPCInstrAltivec.td).
  bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                               unsigned ShuffleKind, SelectionDAG &DAG) {
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
      if (ShuffleKind == 1) // unary
        return isVMerge(N, UnitSize, 8, 8);
      else if (ShuffleKind == 2) // swapped
@@ -987,7 +984,12 @@ bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
  
  /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
  /// amount, otherwise return -1.
-int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) {
+/// The ShuffleKind distinguishes between big-endian operations with two 
+/// different inputs (0), either-endian operations with two identical inputs
+/// (1), and little-endian operations with two different inputs (2).  For the
+/// latter, the input operands are swapped (see PPCInstrAltivec.td).
+int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                             SelectionDAG &DAG) {
    if (N->getValueType(0) != MVT::v16i8)
      return -1;
  
@@ -1005,38 +1007,26 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) {
    unsigned ShiftAmt = SVOp->getMaskElt(i);
    if (ShiftAmt < i) return -1;
  
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+  ShiftAmt -= i;
+  bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()->
+    isLittleEndian();
  
-    ShiftAmt += i;
-
-    if (!isUnary) {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i))
-          return -1;
-    } else {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15))
-          return -1;
-    }
-
-  } else {  // Big Endian
+  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+        return -1;
+  } else if (ShuffleKind == 1) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+        return -1;
+  } else
+    return -1;
  
-    ShiftAmt -= i;
+  if (ShuffleKind == 2 && isLE)
+    ShiftAmt = 16 - ShiftAmt;
  
-    if (!isUnary) {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
-          return -1;
-    } else {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
-          return -1;
-    }
-  }
    return ShiftAmt;
  }
  
@@ -1092,7 +1082,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
                                  SelectionDAG &DAG) {
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
    assert(isSplatShuffleMask(SVOp, EltSize));
-  if (DAG.getTarget().getDataLayout()->isLittleEndian())
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian())
      return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
    else
      return SVOp->getMaskElt(0) / EltSize;
@@ -1639,8 +1629,16 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
    EVT PtrVT = Op.getValueType();
+  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+  const BlockAddress *BA = BASDN->getBlockAddress();
  
-  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual BlockAddress is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(BASDN), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
  
    unsigned MOHiFlag, MOLoFlag;
    bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
@@ -1649,6 +1647,27 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
    return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
  }
  
+// Generate a call to __tls_get_addr for the given GOT entry Op.
+std::pair<SDValue,SDValue>
+PPCTargetLowering::lowerTLSCall(SDValue Op, SDLoc dl,
+                                SelectionDAG &DAG) const {
+  // Op is the call's only argument, typed as a pointer-sized integer.
+  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Node = Op;
+  Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+  // Describe a C-convention call whose chain starts at the DAG entry node.
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, IntPtrTy,
+               DAG.getTargetExternalSymbol("__tls_get_addr", getPointerTy()),
+               std::move(Args), 0);
+  // Callers read the returned pair as (.first = call result, .second = chain).
+  return LowerCallTo(CLI);
+}
+
  SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  
@@ -1661,6 +1680,8 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
    const GlobalValue *GV = GA->getGlobal();
    EVT PtrVT = getPointerTy();
    bool is64bit = Subtarget.isPPC64();
+  const Module *M = DAG.getMachineFunction().getFunction()->getParent();
+  PICLevel::Level picLevel = M->getPICLevel();
  
    TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
  
@@ -1692,64 +1713,46 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
    }
  
    if (Model == TLSModel::GeneralDynamic) {
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TLSGD);
      SDValue GOTPtr;
      if (is64bit) {
        SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
        GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
                                     GOTReg, TGA);
      } else {
-      GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+      if (picLevel == PICLevel::Small)
+        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+      else
+        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
      }
      SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT,
                                     GOTPtr, TGA);
-
-    // We need a chain node, and don't have one handy.  The underlying
-    // call has no side effects, so using the function entry node
-    // suffices.
-    SDValue Chain = DAG.getEntryNode();
-    Chain = DAG.getCopyToReg(Chain, dl,
-                             is64bit ? PPC::X3 : PPC::R3, GOTEntry);
-    SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3,
-                                      is64bit ? MVT::i64 : MVT::i32);
-    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl,
-                                  PtrVT, ParmReg, TGA);
-    // The return value from GET_TLS_ADDR really is in X3 already, but
-    // some hacks are needed here to tie everything together.  The extra
-    // copies dissolve during subsequent transforms.
-    Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr);
-    return DAG.getCopyFromReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, PtrVT);
+    std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
+    return CallResult.first;
    }
  
    if (Model == TLSModel::LocalDynamic) {
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TLSLD);
      SDValue GOTPtr;
      if (is64bit) {
        SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
        GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
                             GOTReg, TGA);
      } else {
-      GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+      if (picLevel == PICLevel::Small)
+        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+      else
+        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
      }
      SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT,
                                     GOTPtr, TGA);
-
-    // We need a chain node, and don't have one handy.  The underlying
-    // call has no side effects, so using the function entry node
-    // suffices.
-    SDValue Chain = DAG.getEntryNode();
-    Chain = DAG.getCopyToReg(Chain, dl,
-                             is64bit ? PPC::X3 : PPC::R3, GOTEntry);
-    SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3,
-                                      is64bit ? MVT::i64 : MVT::i32);
-    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl,
-                                  PtrVT, ParmReg, TGA);
-    // The return value from GET_TLSLD_ADDR really is in X3 already, but
-    // some hacks are needed here to tie everything together.  The extra
-    // copies dissolve during subsequent transforms.
-    Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr);
+    std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
+    SDValue TLSAddr = CallResult.first;
+    SDValue Chain = CallResult.second;
      SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
-                                      Chain, ParmReg, TGA);
+                                      Chain, TLSAddr, TGA);
      return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
    }
  
@@ -2313,7 +2316,8 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
  /// ensure minimum alignment required for target.
  static unsigned EnsureStackAlignment(const TargetMachine &Target,
                                       unsigned NumBytes) {
-  unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment();
+  unsigned TargetAlign =
+      Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment();
    unsigned AlignMask = TargetAlign - 1;
    NumBytes = (NumBytes + AlignMask) & ~AlignMask;
    return NumBytes;
@@ -2390,8 +2394,8 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    // Reserve space for the linkage area on the stack.
    unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false);
@@ -2465,7 +2469,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
    // caller's stack frame, right above the parameter list area.
    SmallVector<CCValAssign, 16> ByValArgLocs;
    CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      ByValArgLocs, *DAG.getContext());
  
    // Reserve stack space for the allocations in CCInfo.
    CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2498,7 +2502,9 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
        PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
        PPC::F8
      };
-    const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+    if (DisablePPCFloatInVariadic)
+      NumFPArgRegs = 0;
  
      FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
                                                            NumGPArgRegs));
@@ -2507,7 +2513,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
  
      // Make room for NumGPArgRegs and NumFPArgRegs.
      int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
-                NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
+                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
  
      FuncInfo->setVarArgsStackOffset(
        MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
@@ -2549,7 +2555,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
                                     MachinePointerInfo(), false, false, 0);
        MemOps.push_back(Store);
        // Increment the address by eight for the next argument to store
-      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
+      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
                                           PtrVT);
        FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
      }
@@ -2698,7 +2704,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
        int FI;
        if (HasParameterArea ||
            ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
-        FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
+        FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false, true);
        else
          FI = MFI->CreateStackObject(ArgSize, Align, false);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
@@ -3064,7 +3070,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
          CurArgOffset = CurArgOffset + (4 - ObjSize);
        }
        // The value of the object is its address.
-      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false, true);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        InVals.push_back(FIN);
        if (ObjSize==1 || ObjSize==2) {
@@ -3568,33 +3574,27 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
      }
  
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
-    // Use indirect calls for ALL functions calls in JIT mode, since the
-    // far-call stubs may be outside relocation limits for a BL instruction.
-    if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
-      unsigned OpFlags = 0;
-      if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
-          (Subtarget.getTargetTriple().isMacOSX() &&
-           Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
-          (G->getGlobal()->isDeclaration() ||
-           G->getGlobal()->isWeakForLinker())) ||
-          (Subtarget.isTargetELF() && !isPPC64 &&
-           !G->getGlobal()->hasLocalLinkage() &&
-           DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
-        // PC-relative references to external symbols should go through $stub,
-        // unless we're building with the leopard linker or later, which
-        // automatically synthesizes these stubs.
-        OpFlags = PPCII::MO_PLT_OR_STUB;
-      }
-
-      // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
-      // every direct call is) turn it into a TargetGlobalAddress /
-      // TargetExternalSymbol node so that legalize doesn't hack it.
-      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
-                                          Callee.getValueType(),
-                                          0, OpFlags);
-      needIndirectCall = false;
+    unsigned OpFlags = 0;
+    if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+         (Subtarget.getTargetTriple().isMacOSX() &&
+          Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
+         (G->getGlobal()->isDeclaration() ||
+          G->getGlobal()->isWeakForLinker())) ||
+        (Subtarget.isTargetELF() && !isPPC64 &&
+         !G->getGlobal()->hasLocalLinkage() &&
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = PPCII::MO_PLT_OR_STUB;
      }
+
+    // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+    // every direct call is) turn it into a TargetGlobalAddress /
+    // TargetExternalSymbol node so that legalize doesn't hack it.
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                        Callee.getValueType(), 0, OpFlags);
+    needIndirectCall = false;
    }
  
    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
@@ -3712,6 +3712,23 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
    if (Callee.getNode()) {
      Ops.push_back(Chain);
      Ops.push_back(Callee);
+
+    // If this is a call to __tls_get_addr, find the symbol whose address
+    // is to be taken and add it to the list.  This will be used to
+    // generate __tls_get_addr(<sym>@tlsgd) or __tls_get_addr(<sym>@tlsld).
+    // We find the symbol by taking the chain node (the register copy of the
+    // argument), reading its operand 2 to reach the ADDI_TLSGD_L or
+    // ADDI_TLSLD_L, and pulling the symbol from operand 1 of that node.
+    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+      if (!strcmp(S->getSymbol(), "__tls_get_addr")) {
+        assert(!needIndirectCall && "Indirect call to __tls_get_addr???");
+        SDNode *AddI = Chain.getNode()->getOperand(2).getNode();
+        SDValue TGTAddr = AddI->getOperand(1);
+        assert(TGTAddr.getNode()->getOpcode() == ISD::TargetGlobalTLSAddress &&
+               "Didn't find target global TLS address where we expected one");
+        Ops.push_back(TGTAddr);
+        CallOpc = PPCISD::CALL_TLS;
+      }
    }
    // If this is a tail call add stack pointer delta.
    if (isTailCall)
@@ -3747,8 +3764,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     SmallVectorImpl<SDValue> &InVals) const {
  
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext());
    CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
  
    // Copy all of the result registers out of their specified physreg.
@@ -3816,7 +3833,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
       getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
  
    // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
    const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3863,7 +3881,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
                  DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
        // Otherwise insert NOP for non-local calls.
        CallOpc = PPCISD::CALL_NOP;
-    }
+    } else if (CallOpc == PPCISD::CALL_TLS)
+      // For 64-bit SVR4, TLS calls are always non-local.
+      CallOpc = PPCISD::CALL_NOP_TLS;
    }
  
    Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
@@ -3962,8 +3982,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
  
    // Assign locations to all of the outgoing arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
  
    // Reserve space for the linkage area on the stack.
    CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false),
@@ -4004,7 +4024,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
    // Assign locations to all of the outgoing aggregate by value arguments.
    SmallVector<CCValAssign, 16> ByValArgLocs;
    CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      ByValArgLocs, *DAG.getContext());
  
    // Reserve stack space for the allocations in CCInfo.
    CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -5040,8 +5060,7 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    LLVMContext &Context) const {
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
-                 RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
    return CCInfo.CheckReturn(Outs, RetCC_PPC);
  }
  
@@ -5053,8 +5072,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
                                 SDLoc dl, SelectionDAG &DAG) const {
  
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
  
    SDValue Flag;
@@ -6064,7 +6083,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
          PPC::isSplatShuffleMask(SVOp, 4) ||
          PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
          PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
-        PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 ||
+        PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
          PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
          PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
          PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
@@ -6081,7 +6100,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
    unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
    if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
        PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
-      PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 ||
+      PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
        PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
@@ -6542,11 +6561,44 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
  //  Other Lowering Code
  //===----------------------------------------------------------------------===//
  
+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent(); // module
+  Function *Func = Intrinsic::getDeclaration(M, Id); // declaration of Id in M
+  return Builder.CreateCall(Func); // call with no arguments, via Builder
+}
+
+// The mappings for emitLeading/TrailingFence are taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                         AtomicOrdering Ord, bool IsStore,
+                                         bool IsLoad) const {
+  if (Ord == SequentiallyConsistent) // seq_cst: emit ppc_sync
+    return callIntrinsic(Builder, Intrinsic::ppc_sync);
+  else if (isAtLeastRelease(Ord)) // release or stronger: emit ppc_lwsync
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  else
+    return nullptr; // no leading fence; IsStore/IsLoad are not consulted
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                          AtomicOrdering Ord, bool IsStore,
+                                          bool IsLoad) const {
+  if (IsLoad && isAtLeastAcquire(Ord)) // only acquire-or-stronger loads
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  // FIXME: The lwsync above is too conservative; a dependent branch + isync
+  // is enough.  See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+  else
+    return nullptr; // no trailing fence otherwise; IsStore is not consulted
+}
+
  MachineBasicBlock *
  PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                      bool is64bit, unsigned BinOpcode) const {
    // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
  
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction *F = BB->getParent();
@@ -6569,9 +6621,8 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
  
    MachineRegisterInfo &RegInfo = F->getRegInfo();
    unsigned TmpReg = (!BinOpcode) ? incr :
-    RegInfo.createVirtualRegister(
-       is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-                 (const TargetRegisterClass *) &PPC::GPRCRegClass);
+    RegInfo.createVirtualRegister( is64bit ? &PPC::G8RCRegClass
+                                           : &PPC::GPRCRegClass);
  
    //  thisMBB:
    //   ...
@@ -6608,7 +6659,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
                                              bool is8bit,    // operation
                                              unsigned BinOpcode) const {
    // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
    // In 64 bit mode we have to use 64 bits for addresses, even though the
    // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
    // registers without caring whether they're 32 or 64, but here we're
@@ -6636,9 +6688,8 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
  
    MachineRegisterInfo &RegInfo = F->getRegInfo();
-  const TargetRegisterClass *RC =
-    is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-              (const TargetRegisterClass *) &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                          : &PPC::GPRCRegClass;
    unsigned PtrReg = RegInfo.createVirtualRegister(RC);
    unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
    unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
@@ -6736,7 +6787,8 @@ llvm::MachineBasicBlock*
  PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                      MachineBasicBlock *MBB) const {
    DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
  
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -6835,7 +6887,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
    // Setup
    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
    const PPCRegisterInfo *TRI =
-    static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
+      getTargetMachine().getSubtarget<PPCSubtarget>().getRegisterInfo();
    MIB.addRegMask(TRI->getNoPreservedMask());
  
    BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
@@ -6884,7 +6936,8 @@ MachineBasicBlock *
  PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                       MachineBasicBlock *MBB) const {
    DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
  
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -6996,7 +7049,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      return emitEHSjLjLongJmp(MI, BB);
    }
  
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
  
    // To "insert" these instructions we actually have to insert their
    // control-flow patterns.
@@ -7019,7 +7073,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      Cond.push_back(MI->getOperand(1));
  
      DebugLoc dl = MI->getDebugLoc();
-    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+    const TargetInstrInfo *TII =
+        getTargetMachine().getSubtargetImpl()->getInstrInfo();
      TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
                        Cond, MI->getOperand(2).getReg(),
                        MI->getOperand(3).getReg());
@@ -7028,11 +7083,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
               MI->getOpcode() == PPC::SELECT_CC_F4 ||
               MI->getOpcode() == PPC::SELECT_CC_F8 ||
               MI->getOpcode() == PPC::SELECT_CC_VRRC ||
+             MI->getOpcode() == PPC::SELECT_CC_VSFRC ||
+             MI->getOpcode() == PPC::SELECT_CC_VSRC ||
               MI->getOpcode() == PPC::SELECT_I4 ||
               MI->getOpcode() == PPC::SELECT_I8 ||
               MI->getOpcode() == PPC::SELECT_F4 ||
               MI->getOpcode() == PPC::SELECT_F8 ||
-             MI->getOpcode() == PPC::SELECT_VRRC) {
+             MI->getOpcode() == PPC::SELECT_VRRC ||
+             MI->getOpcode() == PPC::SELECT_VSFRC ||
+             MI->getOpcode() == PPC::SELECT_VSRC) {
      // The incoming instruction knows the destination vreg to set, the
      // condition code register to branch on, the true/false values to
      // select between, and a branch opcode to use.
@@ -7063,7 +7122,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
          MI->getOpcode() == PPC::SELECT_I8 ||
          MI->getOpcode() == PPC::SELECT_F4 ||
          MI->getOpcode() == PPC::SELECT_F8 ||
-        MI->getOpcode() == PPC::SELECT_VRRC) {
+        MI->getOpcode() == PPC::SELECT_VRRC ||
+        MI->getOpcode() == PPC::SELECT_VSFRC ||
+        MI->getOpcode() == PPC::SELECT_VSRC) {
        BuildMI(BB, dl, TII->get(PPC::BC))
          .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
      } else {
@@ -7246,9 +7307,8 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      exitMBB->transferSuccessorsAndUpdatePHIs(BB);
  
      MachineRegisterInfo &RegInfo = F->getRegInfo();
-    const TargetRegisterClass *RC =
-      is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
-                (const TargetRegisterClass *) &PPC::GPRCRegClass;
+    const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                            : &PPC::GPRCRegClass;
      unsigned PtrReg = RegInfo.createVirtualRegister(RC);
      unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
      unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
@@ -7424,138 +7484,45 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  // Target Optimization Hooks
  //===----------------------------------------------------------------------===//
  
-SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
-                                               DAGCombinerInfo &DCI) const {
-  if (DCI.isAfterLegalizeVectorOps())
-    return SDValue();
-
-  EVT VT = Op.getValueType();
-
-  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
-      (VT == MVT::f64 && Subtarget.hasFRE())  ||
+SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const { // Returns a PPCISD::FRSQRTE node for Operand, or an empty SDValue; the two reference parameters are written only on success.
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRSQRTE())  ||
        (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
        (VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
-    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-    // For the reciprocal, we need to find the zero of the function:
-    //   F(X) = A X - 1 [which has a zero at X = 1/A]
-    //     =>
-    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
-    //     does not require additional intermediate precision]
-
      // Convergence is quadratic, so we essentially double the number of digits
-    // correct after every iteration. The minimum architected relative
-    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
-    // 23 digits and double has 52 digits.
-    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+    // correct after every iteration. For both FRE and FRSQRTE, the minimum
+    // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
+    // 2^-14. IEEE float has 23 digits and double has 52 digits.
+    RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; // Reported through the out-parameter instead of driving a local loop.
      if (VT.getScalarType() == MVT::f64)
-      ++Iterations;
-
-    SelectionDAG &DAG = DCI.DAG;
-    SDLoc dl(Op);
-
-    SDValue FPOne =
-      DAG.getConstantFP(1.0, VT.getScalarType());
-    if (VT.isVector()) {
-      assert(VT.getVectorNumElements() == 4 &&
-             "Unknown vector type");
-      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                          FPOne, FPOne, FPOne, FPOne);
-    }
-
-    SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op);
-    DCI.AddToWorklist(Est.getNode());
-
-    // Newton iterations: Est = Est + Est (1 - Arg * Est)
-    for (int i = 0; i < Iterations; ++i) {
-      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(Est.getNode());
-    }
-
-    return Est;
+      ++RefinementSteps; // One additional step when the scalar element type is f64.
+    UseOneConstNR = true; // NOTE(review): presumably selects a one-constant Newton-Raphson form in the consumer of this hook -- confirm.
+    return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand); // Only the estimate node is built; no refinement iterations are emitted in this function.
    }
-
    return SDValue();
  }
  
-SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
-                                             DAGCombinerInfo &DCI) const {
-  if (DCI.isAfterLegalizeVectorOps())
-    return SDValue();
-
-  EVT VT = Op.getValueType();
-
-  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
-      (VT == MVT::f64 && Subtarget.hasFRSQRTE())  ||
+SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const { // Returns a PPCISD::FRE node for Operand, or an empty SDValue; RefinementSteps is written only on success.
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRE())  ||
        (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
        (VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
-    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-    // For the reciprocal sqrt, we need to find the zero of the function:
-    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
-    //     =>
-    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
-    // As a result, we precompute A/2 prior to the iteration loop.
-
      // Convergence is quadratic, so we essentially double the number of digits
-    // correct after every iteration. The minimum architected relative
-    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
-    // 23 digits and double has 52 digits.
-    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+    // correct after every iteration. For both FRE and FRSQRTE, the minimum
+    // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
+    // 2^-14. IEEE float has 23 digits and double has 52 digits.
+    RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3; // Reported through the out-parameter instead of driving a local loop.
      if (VT.getScalarType() == MVT::f64)
-      ++Iterations;
-
-    SelectionDAG &DAG = DCI.DAG;
-    SDLoc dl(Op);
-
-    SDValue FPThreeHalves =
-      DAG.getConstantFP(1.5, VT.getScalarType());
-    if (VT.isVector()) {
-      assert(VT.getVectorNumElements() == 4 &&
-             "Unknown vector type");
-      FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                                  FPThreeHalves, FPThreeHalves,
-                                  FPThreeHalves, FPThreeHalves);
-    }
-
-    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op);
-    DCI.AddToWorklist(Est.getNode());
-
-    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
-    // this entire sequence requires only one FP constant.
-    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op);
-    DCI.AddToWorklist(HalfArg.getNode());
-
-    HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op);
-    DCI.AddToWorklist(HalfArg.getNode());
-
-    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
-    for (int i = 0; i < Iterations; ++i) {
-      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(Est.getNode());
-    }
-
-    return Est;
+      ++RefinementSteps; // One additional step when the scalar element type is f64.
+    return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); // Only the estimate node is built; no refinement iterations are emitted in this function.
    }
-
    return SDValue();
  }
  
@@ -7612,8 +7579,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
      default: return false;
      case Intrinsic::ppc_altivec_lvx:
      case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::ppc_vsx_lxvw4x:
        VT = MVT::v4i32;
        break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
      case Intrinsic::ppc_altivec_lvebx:
        VT = MVT::i8;
        break;
@@ -7634,8 +7605,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
      default: return false;
      case Intrinsic::ppc_altivec_stvx:
      case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
        VT = MVT::v4i32;
        break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
      case Intrinsic::ppc_altivec_stvebx:
        VT = MVT::i8;
        break;
@@ -7671,7 +7646,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
    // nodes just above the top-level loads and token factors.
    while (!Queue.empty()) {
      SDNode *ChainNext = Queue.pop_back_val();
-    if (!Visited.insert(ChainNext))
+    if (!Visited.insert(ChainNext).second)
        continue;
  
      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
@@ -7702,7 +7677,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
         
      while (!Queue.empty()) {
        SDNode *LoadRoot = Queue.pop_back_val();
-      if (!Visited.insert(LoadRoot))
+      if (!Visited.insert(LoadRoot).second)
          continue;
  
        if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
@@ -7832,7 +7807,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
      SDValue BinOp = BinOps.back();
      BinOps.pop_back();
  
-    if (!Visited.insert(BinOp.getNode()))
+    if (!Visited.insert(BinOp.getNode()).second)
        continue;
  
      PromOps.push_back(BinOp);
@@ -8046,7 +8021,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
      SDValue BinOp = BinOps.back();
      BinOps.pop_back();
  
-    if (!Visited.insert(BinOp.getNode()))
+    if (!Visited.insert(BinOp.getNode()).second)
        continue;
  
      PromOps.push_back(BinOp);
@@ -8283,92 +8258,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::SETCC:
    case ISD::SELECT_CC:
      return DAGCombineTruncBoolExt(N, DCI);
-  case ISD::FDIV: {
-    assert(TM.Options.UnsafeFPMath &&
-           "Reciprocal estimates require UnsafeFPMath");
-
-    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND &&
-               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
-                                 DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
-                         N->getValueType(0), RV);
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND &&
-               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
-                                 DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
-                         N->getValueType(0), RV,
-                         N->getOperand(1).getOperand(1));
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    }
-
-    SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
-    if (RV.getNode()) {
-      DCI.AddToWorklist(RV.getNode());
-      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                         N->getOperand(0), RV);
-    }
-
-    }
-    break;
-  case ISD::FSQRT: {
-    assert(TM.Options.UnsafeFPMath &&
-           "Reciprocal estimates require UnsafeFPMath");
-
-    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
-    // reciprocal sqrt.
-    SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
-    if (RV.getNode()) {
-      DCI.AddToWorklist(RV.getNode());
-      RV = DAGCombineFastRecip(RV, DCI);
-      if (RV.getNode()) {
-        // Unfortunately, RV is now NaN if the input was exactly 0. Select out
-        // this case and force the answer to 0.
-
-        EVT VT = RV.getValueType();
-
-        SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
-        if (VT.isVector()) {
-          assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
-          Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
-        }
-
-        SDValue ZeroCmp =
-          DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
-                       N->getOperand(0), Zero, ISD::SETEQ);
-        DCI.AddToWorklist(ZeroCmp.getNode());
-        DCI.AddToWorklist(RV.getNode());
-
-        RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
-                         ZeroCmp, Zero, RV);
-        return RV;
-      }
-    }
-
-    }
-    break;
    case ISD::SINT_TO_FP:
      if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
        if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
@@ -8459,6 +8348,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
      if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
          TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+        // P8 and later hardware should just use LOAD.
+        !TM.getSubtarget<PPCSubtarget>().hasP8Vector() &&
          (VT == MVT::v16i8 || VT == MVT::v8i16 ||
           VT == MVT::v4i32 || VT == MVT::v4f32) &&
          LD->getAlignment() < ABIAlignment) {
@@ -8984,7 +8875,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
    // the AsmName field from *RegisterInfo.td, then this would not be necessary.
    if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
        PPC::GPRCRegClass.contains(R.first)) {
-    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const TargetRegisterInfo *TRI =
+        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
      return std::make_pair(TRI->getMatchingSuperReg(R.first,
                              PPC::sub_32, &PPC::G8RCRegClass),
                            &PPC::G8RCRegClass);
@@ -9197,6 +9089,92 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
    return false;
  }
  
+bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const { // Fills Info for the Altivec/VSX load and store intrinsics listed below and returns true; returns false for anything else.
+
+  switch (Intrinsic) {
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+  case Intrinsic::ppc_altivec_lvebx:
+  case Intrinsic::ppc_altivec_lvehx:
+  case Intrinsic::ppc_altivec_lvewx:
+  case Intrinsic::ppc_vsx_lxvd2x:
+  case Intrinsic::ppc_vsx_lxvw4x: { // Load intrinsics.
+    EVT VT;
+    switch (Intrinsic) { // Choose the memory value type for this particular load.
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
+    default: // Remaining outer cases: lvx, lvxl and lxvw4x.
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(0); // For the loads the pointer is call argument 0.
+    Info.offset = -VT.getStoreSize()+1; // Described window starts (store size - 1) bytes before ptrVal ...
+    Info.size = 2*VT.getStoreSize()-1; // ... and runs through the last byte of an access beginning at ptrVal. NOTE(review): presumably conservative because the Altivec forms ignore low address bits -- confirm; the VSX forms get the same window.
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = true; // Loads: reads memory, does not write it.
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+  case Intrinsic::ppc_altivec_stvebx:
+  case Intrinsic::ppc_altivec_stvehx:
+  case Intrinsic::ppc_altivec_stvewx:
+  case Intrinsic::ppc_vsx_stxvd2x:
+  case Intrinsic::ppc_vsx_stxvw4x: { // Store intrinsics; mirrors the load case above.
+    EVT VT;
+    switch (Intrinsic) { // Choose the memory value type for this particular store.
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    default: // Remaining outer cases: stvx, stvxl and stxvw4x.
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(1); // For the stores the pointer is call argument 1.
+    Info.offset = -VT.getStoreSize()+1; // Same window as for the loads: starts (store size - 1) bytes before ptrVal ...
+    Info.size = 2*VT.getStoreSize()-1; // ... and covers 2 * store size - 1 bytes in total.
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = false; // Stores: writes memory, does not read it.
+    Info.writeMem = true;
+    return true;
+  }
+  default: // Not one of the intrinsics described here; Info is left untouched.
+    break;
+  }
+
+  return false;
+}
+
  /// getOptimalMemOpType - Returns the target specific optimal type for load
  /// and store operations as a result of memset, memcpy, and memmove
  /// lowering. If DstAlign is zero that means it's safe to destination
@@ -9274,7 +9252,8 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
  
    if (VT.getSimpleVT().isVector()) {
      if (Subtarget.hasVSX()) {
-      if (VT != MVT::v2f64 && VT != MVT::v2i64)
+      if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
+          VT != MVT::v4f32 && VT != MVT::v4i32)
          return false;
      } else {
        return false;