This patch fixes the Altivec addend construction for the fused multiply-add

[oota-llvm.git] / lib / Target / PowerPC / PPCISelLowering.cpp
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp

index c2825959553e8199e87125451dbbf72e5540c8e4..1c8524d3f99d6ef4f05644cbf04ebbe20d9fe16d 100644 (file)
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -14,28 +14,26 @@
  #include "PPCISelLowering.h"
  #include "PPCMachineFunctionInfo.h"
  #include "PPCPerfectShuffle.h"
-#include "PPCPredicates.h"
  #include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
  #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/VectorExtras.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetOptions.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
  #include "llvm/Support/raw_ostream.h"
-#include "llvm/DerivedTypes.h"
+#include "llvm/Target/TargetOptions.h"
  using namespace llvm;
  
  static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
@@ -53,9 +51,11 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
                                                ISD::ArgFlagsTy &ArgFlags,
                                                CCState &State);
  
-static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
-cl::desc("enable preincrement load/store generation on PPC (experimental)"),
-                                     cl::Hidden);
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
  
  static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
    if (TM.getSubtargetImpl()->isDarwin())
@@ -66,6 +66,7 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
  
  PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
+  const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
  
    setPow2DivIsCheap();
  
@@ -75,12 +76,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
  
    // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
    // arguments are at least 4/8 bytes aligned.
-  setMinStackArgumentAlignment(TM.getSubtarget<PPCSubtarget>().isPPC64() ? 8:4);
+  bool isPPC64 = Subtarget->isPPC64();
+  setMinStackArgumentAlignment(isPPC64 ? 8:4);
  
    // Set up the register classes.
-  addRegisterClass(MVT::i32, PPC::GPRCRegisterClass);
-  addRegisterClass(MVT::f32, PPC::F4RCRegisterClass);
-  addRegisterClass(MVT::f64, PPC::F8RCRegisterClass);
+  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+  addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+  addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
  
    // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -104,6 +106,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    // from FP_ROUND:  that rounds to nearest, this rounds to zero.
    setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
  
+  // We do not currently implement these libm ops for PowerPC.
+  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FRINT,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
+
    // PowerPC has no SREM/UREM instructions
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::UREM, MVT::i32, Expand);
@@ -125,17 +134,17 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FREM , MVT::f64, Expand);
    setOperationAction(ISD::FPOW , MVT::f64, Expand);
-  setOperationAction(ISD::FMA  , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Legal);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);
    setOperationAction(ISD::FREM , MVT::f32, Expand);
    setOperationAction(ISD::FPOW , MVT::f32, Expand);
-  setOperationAction(ISD::FMA  , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Legal);
  
    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  
    // If we're enabling GP optimizations, use hardware square root
-  if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) {
+  if (!Subtarget->hasFSQRT()) {
      setOperationAction(ISD::FSQRT, MVT::f64, Expand);
      setOperationAction(ISD::FSQRT, MVT::f32, Expand);
    }
@@ -147,9 +156,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
    setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
    setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
    setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
    setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
  
    // PowerPC does not have ROTR
    setOperationAction(ISD::ROTR, MVT::i32   , Expand);
@@ -211,16 +224,29 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setOperationAction(ISD::TRAP, MVT::Other, Legal);
  
    // TRAMPOLINE is custom lowered.
-  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
  
    // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  
-  // VAARG is custom lowered with the 32-bit SVR4 ABI.
-  if (TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
-      && !TM.getSubtarget<PPCSubtarget>().isPPC64()) {
-    setOperationAction(ISD::VAARG, MVT::Other, Custom);
-    setOperationAction(ISD::VAARG, MVT::i64, Custom);
+  if (Subtarget->isSVR4ABI()) {
+    if (isPPC64) {
+      // VAARG always uses double-word chunks, so promote anything smaller.
+      setOperationAction(ISD::VAARG, MVT::i1, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i8, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i16, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i32, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::Other, Expand);
+    } else {
+      // VAARG is custom lowered with the 32-bit SVR4 ABI.
+      setOperationAction(ISD::VAARG, MVT::Other, Custom);
+      setOperationAction(ISD::VAARG, MVT::i64, Custom);
+    }
    } else
      setOperationAction(ISD::VAARG, MVT::Other, Expand);
  
@@ -249,7 +275,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
  
-  if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+  if (Subtarget->has64BitSupport()) {
      // They also have instructions for converting between i64 and fp.
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
@@ -268,9 +294,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    }
  
-  if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) {
+  if (Subtarget->use64BitRegs()) {
      // 64-bit PowerPC implementations can support i64 types directly
-    addRegisterClass(MVT::i64, PPC::G8RCRegisterClass);
+    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
      // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
      setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
      // 64-bit PowerPC wants to expand i128 shifts itself.
@@ -284,7 +310,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
      setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
    }
  
-  if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) {
+  if (Subtarget->hasAltivec()) {
      // First set operation action for all vector types to expand. Then we
      // will selectively turn on ones that can be effectively codegen'd.
      for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -321,6 +347,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
        setOperationAction(ISD::UREM, VT, Expand);
        setOperationAction(ISD::FDIV, VT, Expand);
        setOperationAction(ISD::FNEG, VT, Expand);
+      setOperationAction(ISD::FSQRT, VT, Expand);
+      setOperationAction(ISD::FLOG, VT, Expand);
+      setOperationAction(ISD::FLOG10, VT, Expand);
+      setOperationAction(ISD::FLOG2, VT, Expand);
+      setOperationAction(ISD::FEXP, VT, Expand);
+      setOperationAction(ISD::FEXP2, VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FABS, VT, Expand);
+      setOperationAction(ISD::FPOWI, VT, Expand);
+      setOperationAction(ISD::FFLOOR, VT, Expand);
+      setOperationAction(ISD::FCEIL,  VT, Expand);
+      setOperationAction(ISD::FTRUNC, VT, Expand);
+      setOperationAction(ISD::FRINT,  VT, Expand);
+      setOperationAction(ISD::FNEARBYINT, VT, Expand);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
        setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
        setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
@@ -332,7 +373,19 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
        setOperationAction(ISD::FPOW, VT, Expand);
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
        setOperationAction(ISD::CTTZ, VT, Expand);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+           j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
+        MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+        setTruncStoreAction(VT, InnerVT, Expand);
+      }
+      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
      }
  
      // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
@@ -345,13 +398,22 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
      setOperationAction(ISD::LOAD  , MVT::v4i32, Legal);
      setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
      setOperationAction(ISD::STORE , MVT::v4i32, Legal);
-
-    addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass);
-    addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass);
-    addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass);
-    addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+
+    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
  
      setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);
      setOperationAction(ISD::MUL, MVT::v8i16, Custom);
      setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -363,11 +425,28 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
      setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    // Altivec does not contain unordered floating-point compare instructions
+    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);
+  }
+
+  if (Subtarget->has64BitSupport()) {
+    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
    }
  
+  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
    setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
  
-  if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+  if (isPPC64) {
      setStackPointerRegisterToSaveRestore(PPC::X1);
      setExceptionPointerRegister(PPC::X3);
      setExceptionSelectorRegister(PPC::X4);
@@ -384,7 +463,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    setTargetDAGCombine(ISD::BSWAP);
  
    // Darwin long double math library functions have $LDBL128 appended.
-  if (TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+  if (Subtarget->isDarwin()) {
      setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
      setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
      setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
@@ -401,7 +480,31 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    if (PPCSubTarget.isDarwin())
      setPrefFunctionAlignment(4);
  
+  if (isPPC64 && Subtarget->isJITCodeModel())
+    // Temporary workaround for the inability of PPC64 JIT to handle jump
+    // tables.
+    setSupportJumpTables(false);
+
+  setInsertFencesForAtomic(true);
+
+  setSchedulingPreference(Sched::Hybrid);
+
    computeRegisterProperties();
+
+  // The Freescale cores does better with aggressive inlining of memcpy and
+  // friends. Gcc uses same threshold of 128 bytes (= 32 word stores).
+  if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc ||
+      Subtarget->getDarwinDirective() == PPC::DIR_E5500) {
+    maxStoresPerMemset = 32;
+    maxStoresPerMemsetOptSize = 16;
+    maxStoresPerMemcpy = 32;
+    maxStoresPerMemcpyOptSize = 8;
+    maxStoresPerMemmove = 32;
+    maxStoresPerMemmoveOptSize = 8;
+
+    setPrefFunctionAlignment(4);
+    benefitFromCodePlacementOpt = true;
+  }
  }
  
  /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -411,7 +514,16 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
    // Darwin passes everything on 4 byte boundary.
    if (TM.getSubtarget<PPCSubtarget>().isDarwin())
      return 4;
-  // FIXME SVR4 TBD
+
+  // 16byte and wider vectors are passed on 16byte boundary.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    if (VTy->getBitWidth() >= 128)
+      return 16;
+
+  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
+   if (PPCSubTarget.isPPC64())
+     return 8;
+
    return 4;
  }
  
@@ -440,6 +552,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case PPCISD::EXTSW_32:        return "PPCISD::EXTSW_32";
    case PPCISD::STD_32:          return "PPCISD::STD_32";
    case PPCISD::CALL_SVR4:       return "PPCISD::CALL_SVR4";
+  case PPCISD::CALL_NOP_SVR4:   return "PPCISD::CALL_NOP_SVR4";
    case PPCISD::CALL_Darwin:     return "PPCISD::CALL_Darwin";
    case PPCISD::NOP:             return "PPCISD::NOP";
    case PPCISD::MTCTR:           return "PPCISD::MTCTR";
@@ -460,11 +573,18 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
    case PPCISD::MTFSF:           return "PPCISD::MTFSF";
    case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
+  case PPCISD::CR6SET:          return "PPCISD::CR6SET";
+  case PPCISD::CR6UNSET:        return "PPCISD::CR6UNSET";
+  case PPCISD::ADDIS_TOC_HA:    return "PPCISD::ADDIS_TOC_HA";
+  case PPCISD::LD_TOC_L:        return "PPCISD::LD_TOC_L";
+  case PPCISD::ADDI_TOC_L:      return "PPCISD::ADDI_TOC_L";
    }
  }
  
-MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const {
-  return MVT::i32;
+EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
  }
  
  //===----------------------------------------------------------------------===//
@@ -754,14 +874,13 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
    }
  
    // Properly sign extend the value.
-  int ShAmt = (4-ByteSize)*8;
-  int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+  int MaskVal = SignExtend32(Value, ByteSize * 8);
  
    // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
    if (MaskVal == 0) return SDValue();
  
    // Finally, if this value fits in a 5 bit sext field, return it
-  if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+  if (SignExtend32<5>(MaskVal) == MaskVal)
      return DAG.getTargetConstant(MaskVal, MVT::i32);
    return SDValue();
  }
@@ -815,14 +934,10 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
      APInt LHSKnownZero, LHSKnownOne;
      APInt RHSKnownZero, RHSKnownOne;
      DAG.ComputeMaskedBits(N.getOperand(0),
-                          APInt::getAllOnesValue(N.getOperand(0)
-                            .getValueSizeInBits()),
                            LHSKnownZero, LHSKnownOne);
  
      if (LHSKnownZero.getBoolValue()) {
        DAG.ComputeMaskedBits(N.getOperand(1),
-                            APInt::getAllOnesValue(N.getOperand(1)
-                              .getValueSizeInBits()),
                              RHSKnownZero, RHSKnownOne);
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
@@ -861,10 +976,11 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
        return true; // [r+i]
      } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
        // Match LOAD (ADD (X, Lo(G))).
-     assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
               && "Cannot handle constant offsets yet!");
        Disp = N.getOperand(1).getOperand(0);  // The global address.
        assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
               Disp.getOpcode() == ISD::TargetConstantPool ||
               Disp.getOpcode() == ISD::TargetJumpTable);
        Base = N.getOperand(0);
@@ -877,10 +993,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
        // (for better address arithmetic) if the LHS and RHS of the OR are
        // provably disjoint.
        APInt LHSKnownZero, LHSKnownOne;
-      DAG.ComputeMaskedBits(N.getOperand(0),
-                            APInt::getAllOnesValue(N.getOperand(0)
-                                                   .getValueSizeInBits()),
-                            LHSKnownZero, LHSKnownOne);
+      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
  
        if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
          // If all of the bits are known zero on the LHS or RHS, the add won't
@@ -968,7 +1081,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
    if (N.getOpcode() == ISD::ADD) {
      short imm = 0;
      if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
-      Disp =  DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+      Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
        if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
        } else {
@@ -977,7 +1090,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
        return true; // [r+i]
      } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
        // Match LOAD (ADD (X, Lo(G))).
-     assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
               && "Cannot handle constant offsets yet!");
        Disp = N.getOperand(1).getOperand(0);  // The global address.
        assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
@@ -993,10 +1106,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
        // (for better address arithmetic) if the LHS and RHS of the OR are
        // provably disjoint.
        APInt LHSKnownZero, LHSKnownOne;
-      DAG.ComputeMaskedBits(N.getOperand(0),
-                            APInt::getAllOnesValue(N.getOperand(0)
-                                                   .getValueSizeInBits()),
-                            LHSKnownZero, LHSKnownOne);
+      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
        if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
          // If all of the bits are known zero on the LHS or RHS, the add won't
          // carry.
@@ -1049,8 +1159,7 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                    SDValue &Offset,
                                                    ISD::MemIndexedMode &AM,
                                                    SelectionDAG &DAG) const {
-  // Disabled by default for now.
-  if (!EnablePPCPreinc) return false;
+  if (DisablePPCPreinc) return false;
  
    SDValue Ptr;
    EVT VT;
@@ -1068,7 +1177,10 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
    if (VT.isVector())
      return false;
  
-  // TODO: Check reg+reg first.
+  if (SelectAddressRegReg(Ptr, Offset, Base, DAG)) {
+    AM = ISD::PRE_INC;
+    return true;
+  }
  
    // LDU/STU use reg+imm*4, others use reg+imm.
    if (VT != MVT::i64) {
@@ -1154,6 +1266,14 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
    const Constant *C = CP->getConstVal();
  
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+    return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
    unsigned MOHiFlag, MOLoFlag;
    bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
    SDValue CPIHi =
@@ -1167,6 +1287,14 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
    EVT PtrVT = Op.getValueType();
    JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+    return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
    unsigned MOHiFlag, MOLoFlag;
    bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
    SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
@@ -1182,11 +1310,35 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
  
    unsigned MOHiFlag, MOLoFlag;
    bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
-  SDValue TgtBAHi = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOHiFlag);
-  SDValue TgtBALo = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOLoFlag);
+  SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
+  SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
    return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
  }
  
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  DebugLoc dl = GA->getDebugLoc();
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+  bool is64bit = PPCSubTarget.isPPC64();
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TPREL16_HA);
+  SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TPREL16_LO);
+
+  if (model != TLSModel::LocalExec)
+    llvm_unreachable("only local-exec TLS mode supported");
+  SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+                                   is64bit ? MVT::i64 : MVT::i32);
+  SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
+  return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
+}
+
  SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
    EVT PtrVT = Op.getValueType();
@@ -1216,7 +1368,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
    // extra load to get the address of the global.
    if (MOHiFlag & PPCII::MO_NLP_FLAG)
      Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo(),
-                      false, false, 0);
+                      false, false, false, 0);
    return Ptr;
  }
  
@@ -1312,11 +1464,13 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
  
    // areas
    SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr,
-                                     MachinePointerInfo(), false, false, 0);
+                                     MachinePointerInfo(), false, false,
+                                     false, 0);
    InChain = OverflowArea.getValue(1);
  
    SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr,
-                                    MachinePointerInfo(), false, false, 0);
+                                    MachinePointerInfo(), false, false,
+                                    false, 0);
    InChain = RegSaveArea.getValue(1);
  
    // select overflow_area if index > 8
@@ -1365,11 +1519,17 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
                                MachinePointerInfo(),
                                MVT::i32, false, false, 0);
  
-  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), false, false, 0);
+  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
+                     false, false, false, 0);
  }
  
-SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
-                                           SelectionDAG &DAG) const {
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
    SDValue Chain = Op.getOperand(0);
    SDValue Trmp = Op.getOperand(1); // trampoline
    SDValue FPtr = Op.getOperand(2); // nested function
@@ -1379,7 +1539,7 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    bool isPPC64 = (PtrVT == MVT::i64);
    Type *IntPtrTy =
-    DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType(
+    DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
                                                               *DAG.getContext());
  
    TargetLowering::ArgListTy Args;
@@ -1397,17 +1557,18 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
    Entry.Node = Nest; Args.push_back(Entry);
  
    // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
-  std::pair<SDValue, SDValue> CallResult =
-    LowerCallTo(Chain, Op.getValueType().getTypeForEVT(*DAG.getContext()),
-                false, false, false, false, 0, CallingConv::C, false,
-                /*isReturnValueUsed=*/true,
+  TargetLowering::CallLoweringInfo CLI(Chain,
+                                       Type::getVoidTy(*DAG.getContext()),
+                                       false, false, false, false, 0,
+                                       CallingConv::C,
+                /*isTailCall=*/false,
+                                       /*doesNotRet=*/false,
+                                       /*isReturnValueUsed=*/true,
                  DAG.getExternalSymbol("__trampoline_setup", PtrVT),
                  Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  
-  SDValue Ops[] =
-    { CallResult.first, CallResult.second };
-
-  return DAG.getMergeValues(Ops, 2, dl);
+  return CallResult.second;
  }
  
  SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
@@ -1521,7 +1682,7 @@ static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
                                              CCValAssign::LocInfo &LocInfo,
                                              ISD::ArgFlagsTy &ArgFlags,
                                              CCState &State) {
-  static const unsigned ArgRegs[] = {
+  static const uint16_t ArgRegs[] = {
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
@@ -1548,7 +1709,7 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
                                                CCValAssign::LocInfo &LocInfo,
                                                ISD::ArgFlagsTy &ArgFlags,
                                                CCState &State) {
-  static const unsigned ArgRegs[] = {
+  static const uint16_t ArgRegs[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
@@ -1572,8 +1733,8 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
  
  /// GetFPR - Get the set of FP registers that should be allocated for arguments,
  /// on Darwin.
-static const unsigned *GetFPR() {
-  static const unsigned FPR[] = {
+static const uint16_t *GetFPR() {
+  static const uint16_t FPR[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
    };
@@ -1601,9 +1762,13 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
                                          DebugLoc dl, SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &InVals)
                                            const {
-  if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) {
-    return LowerFormalArguments_SVR4(Chain, CallConv, isVarArg, Ins,
-                                     dl, DAG, InVals);
+  if (PPCSubTarget.isSVR4ABI()) {
+    if (PPCSubTarget.isPPC64())
+      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+    else
+      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
    } else {
      return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
                                         dl, DAG, InVals);
@@ -1611,7 +1776,7 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
  }
  
  SDValue
-PPCTargetLowering::LowerFormalArguments_SVR4(
+PPCTargetLowering::LowerFormalArguments_32SVR4(
                                        SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
@@ -1654,13 +1819,14 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
  
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    // Potential tail calls could cause overwriting of argument stack slots.
-  bool isImmutable = !(GuaranteedTailCallOpt && (CallConv==CallingConv::Fast));
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
    unsigned PtrByteSize = 4;
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                getTargetMachine(), ArgLocs, *DAG.getContext());
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
  
    // Reserve space for the linkage area on the stack.
    CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -1672,26 +1838,26 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
  
      // Arguments stored in registers.
      if (VA.isRegLoc()) {
-      TargetRegisterClass *RC;
+      const TargetRegisterClass *RC;
        EVT ValVT = VA.getValVT();
  
        switch (ValVT.getSimpleVT().SimpleTy) {
          default:
            llvm_unreachable("ValVT not supported by formal arguments Lowering");
          case MVT::i32:
-          RC = PPC::GPRCRegisterClass;
+          RC = &PPC::GPRCRegClass;
            break;
          case MVT::f32:
-          RC = PPC::F4RCRegisterClass;
+          RC = &PPC::F4RCRegClass;
            break;
          case MVT::f64:
-          RC = PPC::F8RCRegisterClass;
+          RC = &PPC::F8RCRegClass;
            break;
          case MVT::v16i8:
          case MVT::v8i16:
          case MVT::v4i32:
          case MVT::v4f32:
-          RC = PPC::VRRCRegisterClass;
+          RC = &PPC::VRRCRegClass;
            break;
        }
  
@@ -1712,7 +1878,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                     MachinePointerInfo(),
-                                   false, false, 0));
+                                   false, false, false, 0));
      }
    }
  
@@ -1721,7 +1887,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
    // caller's stack frame, right above the parameter list area.
    SmallVector<CCValAssign, 16> ByValArgLocs;
    CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                     getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
  
    // Reserve stack space for the allocations in CCInfo.
    CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -1753,13 +1919,13 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
    // If the function takes variable number of arguments, make a frame index for
    // the start of the first vararg value... for expansion of llvm.va_start.
    if (isVarArg) {
-    static const unsigned GPArgRegs[] = {
+    static const uint16_t GPArgRegs[] = {
        PPC::R3, PPC::R4, PPC::R5, PPC::R6,
        PPC::R7, PPC::R8, PPC::R9, PPC::R10,
      };
      const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
  
-    static const unsigned FPArgRegs[] = {
+    static const uint16_t FPArgRegs[] = {
        PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
        PPC::F8
      };
@@ -1827,8 +1993,50 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
    return Chain;
  }
  
+// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+// value to MVT::i64 and then truncate to the correct register size.
  SDValue
-PPCTargetLowering::LowerFormalArguments_Darwin(
+PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+                                     SelectionDAG &DAG, SDValue ArgVal,
+                                     DebugLoc dl) const {
+  if (Flags.isSExt())
+    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  else if (Flags.isZExt())
+    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  
+  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+}
+
+// Set the size that is at least reserved in caller of this function.  Tail
+// call optimized functions' reserved stack space needs to be aligned so that
+// taking the difference between two stack areas will result in an aligned
+// stack.
+void
+PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
+                                      unsigned nAltivecParamsAtEnd,
+                                      unsigned MinReservedArea,
+                                      bool isPPC64) const {
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  // Add the Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    MinReservedArea = ((MinReservedArea+15)/16)*16;
+    MinReservedArea += 16*nAltivecParamsAtEnd;
+  }
+  MinReservedArea =
+    std::max(MinReservedArea,
+             PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
+  unsigned TargetAlign
+    = DAG.getMachineFunction().getTarget().getFrameLowering()->
+        getStackAlignment();
+  unsigned AlignMask = TargetAlign-1;
+  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
+  FI->setMinReservedArea(MinReservedArea);
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_64SVR4(
                                        SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
@@ -1842,94 +2050,41 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
    PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-  bool isPPC64 = PtrVT == MVT::i64;
    // Potential tail calls could cause overwriting of argument stack slots.
-  bool isImmutable = !(GuaranteedTailCallOpt && (CallConv==CallingConv::Fast));
-  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 8;
  
-  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
    // Area that is at least reserved in caller of this function.
    unsigned MinReservedArea = ArgOffset;
  
-  static const unsigned GPR_32[] = {           // 32-bit registers.
-    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
-    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
-  };
-  static const unsigned GPR_64[] = {           // 64-bit registers.
+  static const uint16_t GPR[] = {
      PPC::X3, PPC::X4, PPC::X5, PPC::X6,
      PPC::X7, PPC::X8, PPC::X9, PPC::X10,
    };
  
-  static const unsigned *FPR = GetFPR();
+  static const uint16_t *FPR = GetFPR();
  
-  static const unsigned VR[] = {
+  static const uint16_t VR[] = {
      PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
      PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
    };
  
-  const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
+  const unsigned Num_GPR_Regs = array_lengthof(GPR);
    const unsigned Num_FPR_Regs = 13;
-  const unsigned Num_VR_Regs  = array_lengthof( VR);
+  const unsigned Num_VR_Regs  = array_lengthof(VR);
  
    unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  
-  const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;
-
-  // In 32-bit non-varargs functions, the stack space for vectors is after the
-  // stack space for non-vectors.  We do not use this space unless we have
-  // too many vectors to fit in registers, something that only occurs in
-  // constructed examples:), but we have to walk the arglist to figure
-  // that out...for the pathological case, compute VecArgOffset as the
-  // start of the vector parameter area.  Computing VecArgOffset is the
-  // entire point of the following loop.
-  unsigned VecArgOffset = ArgOffset;
-  if (!isVarArg && !isPPC64) {
-    for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
-         ++ArgNo) {
-      EVT ObjectVT = Ins[ArgNo].VT;
-      unsigned ObjSize = ObjectVT.getSizeInBits()/8;
-      ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
-
-      if (Flags.isByVal()) {
-        // ObjSize is the true size, ArgSize rounded up to multiple of regs.
-        ObjSize = Flags.getByValSize();
-        unsigned ArgSize =
-                ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
-        VecArgOffset += ArgSize;
-        continue;
-      }
-
-      switch(ObjectVT.getSimpleVT().SimpleTy) {
-      default: llvm_unreachable("Unhandled argument type!");
-      case MVT::i32:
-      case MVT::f32:
-        VecArgOffset += isPPC64 ? 8 : 4;
-        break;
-      case MVT::i64:  // PPC64
-      case MVT::f64:
-        VecArgOffset += 8;
-        break;
-      case MVT::v4f32:
-      case MVT::v4i32:
-      case MVT::v8i16:
-      case MVT::v16i8:
-        // Nothing to do, we're only looking at Nonvector args here.
-        break;
-      }
-    }
-  }
-  // We've found where the vector parameter area in memory is.  Skip the
-  // first 12 parameters; these don't use that memory.
-  VecArgOffset = ((VecArgOffset+15)/16)*16;
-  VecArgOffset += 12*16;
-
    // Add DAG nodes to load the arguments or copy them out of registers.  On
    // entry to a function on PPC, the arguments start after the linkage area,
    // although the first ones are often in registers.
  
    SmallVector<SDValue, 8> MemOps;
    unsigned nAltivecParamsAtEnd = 0;
-  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
      SDValue ArgVal;
      bool needsLoad = false;
      EVT ObjectVT = Ins[ArgNo].VT;
@@ -1942,12 +2097,13 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
      // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
      if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
          ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
-      if (isVarArg || isPPC64) {
+      if (isVarArg) {
          MinReservedArea = ((MinReservedArea+15)/16)*16;
          MinReservedArea += CalculateStackSlotSize(ObjectVT,
                                                    Flags,
                                                    PtrByteSize);
-      } else  nAltivecParamsAtEnd++;
+      } else
+        nAltivecParamsAtEnd++;
      } else
        // Calculate min reserved area.
        MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
@@ -1960,56 +2116,80 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
        // ObjSize is the true size, ArgSize rounded up to multiple of registers.
        ObjSize = Flags.getByValSize();
        ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
-      // Objects of size 1 and 2 are right justified, everything else is
-      // left justified.  This means the memory address is adjusted forwards.
-      if (ObjSize==1 || ObjSize==2) {
-        CurArgOffset = CurArgOffset + (4 - ObjSize);
+      // Empty aggregate parameters do not take up registers.  Examples:
+      //   struct { } a;
+      //   union  { } b;
+      //   int c[0];
+      // etc.  However, we have to provide a place-holder in InVals, so
+      // pretend we have an 8-byte item at the current address for that
+      // purpose.
+      if (!ObjSize) {
+        int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+        InVals.push_back(FIN);
+        continue;
        }
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (ObjSize < PtrByteSize)
+        CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
        // The value of the object is its address.
        int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        InVals.push_back(FIN);
-      if (ObjSize==1 || ObjSize==2) {
+
+      if (ObjSize < 8) {
          if (GPR_idx != Num_GPR_Regs) {
-          unsigned VReg;
-          if (isPPC64)
-            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
-          else
-            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
            SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
-          SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
-                                            MachinePointerInfo(),
-                                            ObjSize==1 ? MVT::i8 : MVT::i16,
-                                            false, false, 0);
+          SDValue Store;
+
+          if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
+            EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+                           (ObjSize == 2 ? MVT::i16 : MVT::i32));
+            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+                                      MachinePointerInfo(FuncArg, CurArgOffset),
+                                      ObjType, false, false, 0);
+          } else {
+            // For sizes that don't fit a truncating store (3, 5, 6, 7),
+            // store the whole register as-is to the parameter save area
+            // slot.  The address of the parameter was already calculated
+            // above (InVals.push_back(FIN)) to be the right-justified
+            // offset within the slot.  For this store, we need a new
+            // frame index that points at the beginning of the slot.
+            int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+            Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                 MachinePointerInfo(FuncArg, ArgOffset),
+                                 false, false, 0);
+          }
+
            MemOps.push_back(Store);
            ++GPR_idx;
          }
-
+        // Whether we copied from a register or not, advance the offset
+        // into the parameter save area by a full doubleword.
          ArgOffset += PtrByteSize;
-
          continue;
        }
+
        for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
          // Store whatever pieces of the object are in registers
-        // to memory.  ArgVal will be address of the beginning of
-        // the object.
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
          if (GPR_idx != Num_GPR_Regs) {
            unsigned VReg;
-          if (isPPC64)
-            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
-          else
-            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
            int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
            SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
            SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                                       MachinePointerInfo(),
+                                       MachinePointerInfo(FuncArg, ArgOffset),
                                         false, false, 0);
            MemOps.push_back(Store);
            ++GPR_idx;
            ArgOffset += PtrByteSize;
          } else {
-          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
+          ArgOffset += ArgSize - j;
            break;
          }
        }
@@ -2019,55 +2199,30 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
      switch (ObjectVT.getSimpleVT().SimpleTy) {
      default: llvm_unreachable("Unhandled argument type!");
      case MVT::i32:
-      if (!isPPC64) {
-        if (GPR_idx != Num_GPR_Regs) {
-          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
-          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
-          ++GPR_idx;
-        } else {
-          needsLoad = true;
-          ArgSize = PtrByteSize;
-        }
-        // All int arguments reserve stack space in the Darwin ABI.
-        ArgOffset += PtrByteSize;
-        break;
-      }
-      // FALLTHROUGH
-    case MVT::i64:  // PPC64
+    case MVT::i64:
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
  
-        if (ObjectVT == MVT::i32) {
+        if (ObjectVT == MVT::i32)
            // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
            // value to MVT::i64 and then truncate to the correct register size.
-          if (Flags.isSExt())
-            ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
-                                 DAG.getValueType(ObjectVT));
-          else if (Flags.isZExt())
-            ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
-                                 DAG.getValueType(ObjectVT));
-
-          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
-        }
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
  
          ++GPR_idx;
        } else {
          needsLoad = true;
          ArgSize = PtrByteSize;
        }
-      // All int arguments reserve stack space in the Darwin ABI.
        ArgOffset += 8;
        break;
  
      case MVT::f32:
      case MVT::f64:
-      // Every 4 bytes of argument space consumes one of the GPRs available for
+      // Every 8 bytes of argument space consumes one of the GPRs available for
        // argument passing.
        if (GPR_idx != Num_GPR_Regs) {
          ++GPR_idx;
-        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
-          ++GPR_idx;
        }
        if (FPR_idx != Num_FPR_Regs) {
          unsigned VReg;
@@ -2081,10 +2236,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
          ++FPR_idx;
        } else {
          needsLoad = true;
+        ArgSize = PtrByteSize;
        }
  
-      // All FP arguments reserve stack space in the Darwin ABI.
-      ArgOffset += isPPC64 ? 8 : ObjSize;
+      ArgOffset += 8;
        break;
      case MVT::v4f32:
      case MVT::v4i32:
@@ -2106,53 +2261,34 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
          }
          ++VR_idx;
        } else {
-        if (!isVarArg && !isPPC64) {
-          // Vectors go after all the nonvectors.
-          CurArgOffset = VecArgOffset;
-          VecArgOffset += 16;
-        } else {
-          // Vectors are aligned.
-          ArgOffset = ((ArgOffset+15)/16)*16;
-          CurArgOffset = ArgOffset;
-          ArgOffset += 16;
-        }
+        // Vectors are aligned.
+        ArgOffset = ((ArgOffset+15)/16)*16;
+        CurArgOffset = ArgOffset;
+        ArgOffset += 16;
          needsLoad = true;
        }
        break;
      }
  
-    // We need to load the argument to a virtual register if we determined above
-    // that we ran out of physical registers of the appropriate type.
+    // We need to load the argument to a virtual register if we determined
+    // above that we ran out of physical registers of the appropriate type.
      if (needsLoad) {
        int FI = MFI->CreateFixedObject(ObjSize,
                                        CurArgOffset + (ArgSize - ObjSize),
                                        isImmutable);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
-                           false, false, 0);
+                           false, false, false, 0);
      }
  
      InVals.push_back(ArgVal);
    }
  
    // Set the size that is at least reserved in caller of this function.  Tail
-  // call optimized function's reserved stack space needs to be aligned so that
+  // call optimized functions' reserved stack space needs to be aligned so that
    // taking the difference between two stack areas will result in an aligned
    // stack.
-  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
-  // Add the Altivec parameters at the end, if needed.
-  if (nAltivecParamsAtEnd) {
-    MinReservedArea = ((MinReservedArea+15)/16)*16;
-    MinReservedArea += 16*nAltivecParamsAtEnd;
-  }
-  MinReservedArea =
-    std::max(MinReservedArea,
-             PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
-  unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()->
-    getStackAlignment();
-  unsigned AlignMask = TargetAlign-1;
-  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
-  FI->setMinReservedArea(MinReservedArea);
+  setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true);
  
    // If the function takes variable number of arguments, make a frame index for
    // the start of the first vararg value... for expansion of llvm.va_start.
@@ -2160,27 +2296,20 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
      int Depth = ArgOffset;
  
      FuncInfo->setVarArgsFrameIndex(
-      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
-                             Depth, true));
+      MFI->CreateFixedObject(PtrByteSize, Depth, true));
      SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  
      // If this function is vararg, store any remaining integer argument regs
      // to their spots on the stack so that they may be loaded by deferencing the
      // result of va_next.
      for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
-      unsigned VReg;
-
-      if (isPPC64)
-        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
-      else
-        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
-
+      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                     MachinePointerInfo(), false, false, 0);
        MemOps.push_back(Store);
        // Increment the address by four for the next argument to store
-      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+      SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT);
        FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
      }
    }
@@ -2192,32 +2321,380 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
    return Chain;
  }
  
-/// CalculateParameterAndLinkageAreaSize - Get the size of the paramter plus
-/// linkage area for the Darwin ABI.
-static unsigned
-CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
-                                     bool isPPC64,
-                                     bool isVarArg,
-                                     unsigned CC,
-                                     const SmallVectorImpl<ISD::OutputArg>
-                                       &Outs,
-                                     const SmallVectorImpl<SDValue> &OutVals,
-                                     unsigned &nAltivecParamsAtEnd) {
-  // Count how many bytes are to be pushed on the stack, including the linkage
-  // area, and parameter passing area.  We start with 24/48 bytes, which is
-  // prereserved space for [SP][CR][LR][3 x unused].
-  unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true);
-  unsigned NumOps = Outs.size();
-  unsigned PtrByteSize = isPPC64 ? 8 : 4;
-
-  // Add up all the space actually used.
-  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
-  // they all go in registers, but we must reserve stack space for them for
-  // possible use by the caller.  In varargs or 64-bit calls, parameters are
-  // assigned stack space in order, with padding so Altivec parameters are
-  // 16-byte aligned.
-  nAltivecParamsAtEnd = 0;
-  for (unsigned i = 0; i != NumOps; ++i) {
+SDValue
+PPCTargetLowering::LowerFormalArguments_Darwin(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  // Area that is at least reserved in caller of this function.
+  unsigned MinReservedArea = ArgOffset;
+
+  static const uint16_t GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const uint16_t GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
+  const unsigned Num_FPR_Regs = 13;
+  const unsigned Num_VR_Regs  = array_lengthof( VR);
+
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  // In 32-bit non-varargs functions, the stack space for vectors is after the
+  // stack space for non-vectors.  We do not use this space unless we have
+  // too many vectors to fit in registers, something that only occurs in
+  // constructed examples:), but we have to walk the arglist to figure
+  // that out...for the pathological case, compute VecArgOffset as the
+  // start of the vector parameter area.  Computing VecArgOffset is the
+  // entire point of the following loop.
+  unsigned VecArgOffset = ArgOffset;
+  if (!isVarArg && !isPPC64) {
+    for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
+         ++ArgNo) {
+      EVT ObjectVT = Ins[ArgNo].VT;
+      ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+      if (Flags.isByVal()) {
+        // ObjSize is the true size, ArgSize rounded up to multiple of regs.
+        unsigned ObjSize = Flags.getByValSize();
+        unsigned ArgSize =
+                ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+        VecArgOffset += ArgSize;
+        continue;
+      }
+
+      switch(ObjectVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unhandled argument type!");
+      case MVT::i32:
+      case MVT::f32:
+        VecArgOffset += 4;
+        break;
+      case MVT::i64:  // PPC64
+      case MVT::f64:
+        // FIXME: We are guaranteed to be !isPPC64 at this point.
+        // Does MVT::i64 apply?
+        VecArgOffset += 8;
+        break;
+      case MVT::v4f32:
+      case MVT::v4i32:
+      case MVT::v8i16:
+      case MVT::v16i8:
+        // Nothing to do, we're only looking at Nonvector args here.
+        break;
+      }
+    }
+  }
+  // We've found where the vector parameter area in memory is.  Skip the
+  // first 12 parameters; these don't use that memory.
+  VecArgOffset = ((VecArgOffset+15)/16)*16;
+  VecArgOffset += 12*16;
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  SmallVector<SDValue, 8> MemOps;
+  unsigned nAltivecParamsAtEnd = 0;
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+    unsigned CurArgOffset = ArgOffset;
+
+    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+      if (isVarArg || isPPC64) {
+        MinReservedArea = ((MinReservedArea+15)/16)*16;
+        MinReservedArea += CalculateStackSlotSize(ObjectVT,
+                                                  Flags,
+                                                  PtrByteSize);
+      } else  nAltivecParamsAtEnd++;
+    } else
+      // Calculate min reserved area.
+      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+                                                Flags,
+                                                PtrByteSize);
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Objects of size 1 and 2 are right justified, everything else is
+      // left justified.  This means the memory address is adjusted forwards.
+      if (ObjSize==1 || ObjSize==2) {
+        CurArgOffset = CurArgOffset + (4 - ObjSize);
+      }
+      // The value of the object is its address.
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+      if (ObjSize==1 || ObjSize==2) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
+          SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+                                            MachinePointerInfo(FuncArg,
+                                              CurArgOffset),
+                                            ObjType, false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+        }
+
+        ArgOffset += PtrByteSize;
+
+        continue;
+      }
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        // Store whatever pieces of the object are in registers
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                       MachinePointerInfo(FuncArg, ArgOffset),
+                                       false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i32:
+      if (!isPPC64) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+          ++GPR_idx;
+        } else {
+          needsLoad = true;
+          ArgSize = PtrByteSize;
+        }
+        // All int arguments reserve stack space in the Darwin ABI.
+        ArgOffset += PtrByteSize;
+        break;
+      }
+      // FALLTHROUGH
+    case MVT::i64:  // PPC64
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      // All int arguments reserve stack space in the Darwin ABI.
+      ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // Every 4 bytes of argument space consumes one of the GPRs available for
+      // argument passing.
+      if (GPR_idx != Num_GPR_Regs) {
+        ++GPR_idx;
+        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+          ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+      }
+
+      // All FP arguments reserve stack space in the Darwin ABI.
+      ArgOffset += isPPC64 ? 8 : ObjSize;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space,
+      // except in varargs functions.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        if (isVarArg) {
+          while ((ArgOffset % 16) != 0) {
+            ArgOffset += PtrByteSize;
+            if (GPR_idx != Num_GPR_Regs)
+              GPR_idx++;
+          }
+          ArgOffset += 16;
+          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+        }
+        ++VR_idx;
+      } else {
+        if (!isVarArg && !isPPC64) {
+          // Vectors go after all the nonvectors.
+          CurArgOffset = VecArgOffset;
+          VecArgOffset += 16;
+        } else {
+          // Vectors are aligned.
+          ArgOffset = ((ArgOffset+15)/16)*16;
+          CurArgOffset = ArgOffset;
+          ArgOffset += 16;
+        }
+        needsLoad = true;
+      }
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined above
+    // that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      CurArgOffset + (ArgSize - ObjSize),
+                                      isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+                             Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by deferencing the
+    // result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg;
+
+      if (isPPC64)
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      else
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl,
+                        MVT::Other, &MemOps[0], MemOps.size());
+
+  return Chain;
+}
+
+/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
+/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI.
+static unsigned
+CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
+                                     bool isPPC64,
+                                     bool isVarArg,
+                                     unsigned CC,
+                                     const SmallVectorImpl<ISD::OutputArg>
+                                       &Outs,
+                                     const SmallVectorImpl<SDValue> &OutVals,
+                                     unsigned &nAltivecParamsAtEnd) {
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // prereserved space for [SP][CR][LR][3 x unused].
+  unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  unsigned NumOps = Outs.size();
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  // Add up all the space actually used.
+  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+  // they all go in registers, but we must reserve stack space for them for
+  // possible use by the caller.  In varargs or 64-bit calls, parameters are
+  // assigned stack space in order, with padding so Altivec parameters are
+  // 16-byte aligned.
+  nAltivecParamsAtEnd = 0;
+  for (unsigned i = 0; i != NumOps; ++i) {
      ISD::ArgFlagsTy Flags = Outs[i].Flags;
      EVT ArgVT = Outs[i].VT;
      // Varargs Altivec parameters are padded to a 16 byte boundary.
@@ -2250,9 +2727,9 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
                        PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
  
    // Tail call needs the stack to be aligned.
-  if (CC==CallingConv::Fast && GuaranteedTailCallOpt) {
-    unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()->
-      getStackAlignment();
+  if (CC == CallingConv::Fast && DAG.getTarget().Options.GuaranteedTailCallOpt){
+    unsigned TargetAlign = DAG.getMachineFunction().getTarget().
+      getFrameLowering()->getStackAlignment();
      unsigned AlignMask = TargetAlign-1;
      NumBytes = (NumBytes + AlignMask) & ~AlignMask;
    }
@@ -2286,7 +2763,7 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                       bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg> &Ins,
                                                       SelectionDAG& DAG) const {
-  if (!GuaranteedTailCallOpt)
+  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
      return false;
  
    // Variable argument functions are not supported.
@@ -2324,7 +2801,7 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
  
    int Addr = C->getZExtValue();
    if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
-      (Addr << 6 >> 6) != Addr)
+      SignExtend32<26>(Addr) != Addr)
      return 0;  // Top 6 bits have to be sext of immediate.
  
    return DAG.getConstant((int)C->getZExtValue() >> 2,
@@ -2434,7 +2911,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
      EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
      LROpOut = getReturnAddrFrameIndex(DAG);
      LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(),
-                          false, false, 0);
+                          false, false, false, 0);
      Chain = SDValue(LROpOut.getNode(), 1);
  
      // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
@@ -2442,7 +2919,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
      if (isDarwinABI) {
        FPOpOut = getFramePointerFrameIndex(DAG);
        FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo(),
-                            false, false, 0);
+                            false, false, false, 0);
        Chain = SDValue(FPOpOut.getNode(), 1);
      }
    }
@@ -2602,7 +3079,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
        // Thus for a call through a function pointer, the following actions need
        // to be performed:
        //   1. Save the TOC of the caller in the TOC save area of its stack
-      //      frame (this is done in LowerCall_Darwin()).
+      //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
        //   2. Load the address of the function entry point from the function
        //      descriptor.
        //   3. Load the TOC of the callee from the function descriptor into r2.
@@ -2692,6 +3169,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
    return CallOpc;
  }
  
+static
+bool isLocalCall(const SDValue &Callee)
+{
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    return !G->getGlobal()->isDeclaration() &&
+           !G->getGlobal()->isWeakForLinker();
+  return false;
+}
+
  SDValue
  PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     CallingConv::ID CallConv, bool isVarArg,
@@ -2701,18 +3187,38 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
  
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                   getTargetMachine(), RVLocs, *DAG.getContext());
+                    getTargetMachine(), RVLocs, *DAG.getContext());
    CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
  
    // Copy all of the result registers out of their specified physreg.
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
-    EVT VT = VA.getValVT();
      assert(VA.isRegLoc() && "Can only return in registers!");
-    Chain = DAG.getCopyFromReg(Chain, dl,
-                               VA.getLocReg(), VT, InFlag).getValue(1);
-    InVals.push_back(Chain.getValue(0));
-    InFlag = Chain.getValue(2);
+
+    SDValue Val = DAG.getCopyFromReg(Chain, dl,
+                                     VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
    }
  
    return Chain;
@@ -2735,11 +3241,22 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
                                   isTailCall, RegsToPass, Ops, NodeTys,
                                   PPCSubTarget);
  
+  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+  if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
    // When performing tail call optimization the callee pops its arguments off
    // the stack. Account for this here so these bytes can be pushed back on in
    // PPCRegisterInfo::eliminateCallFramePseudoInstr.
    int BytesCalleePops =
-    (CallConv==CallingConv::Fast && GuaranteedTailCallOpt) ? NumBytes : 0;
+    (CallConv == CallingConv::Fast &&
+     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
  
    if (InFlag.getNode())
      Ops.push_back(InFlag);
@@ -2751,7 +3268,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
      if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
        SmallVector<CCValAssign, 16> RVLocs;
        CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext());
+                     getTargetMachine(), RVLocs, *DAG.getContext());
        CCInfo.AnalyzeCallResult(Ins, RetCC_PPC);
        for (unsigned i = 0; i != RVLocs.size(); ++i)
          DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
@@ -2767,9 +3284,6 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
      return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Ops[0], Ops.size());
    }
  
-  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
-  InFlag = Chain.getValue(1);
-
    // Add a NOP immediately after the branch instruction when using the 64-bit
    // SVR4 ABI. At link time, if caller and callee are in a different module and
    // thus have a different TOC, the call will be replaced with a call to a stub
@@ -2778,8 +3292,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
    // which restores the TOC of the caller from the TOC save slot of the current
    // stack frame. If caller and callee belong to the same module (and have the
    // same TOC), the NOP will remain unchanged.
+
+  bool needsTOCRestore = false;
    if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) {
-    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
      if (CallOpc == PPCISD::BCTRL_SVR4) {
        // This is a call through a function pointer.
        // Restore the caller TOC from the save area into R2.
@@ -2790,14 +3305,22 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
        // since r2 is a reserved register (which prevents the register allocator
        // from allocating it), resulting in an additional register being
        // allocated and an unnecessary move instruction being generated.
-      Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag);
-      InFlag = Chain.getValue(1);
-    } else {
-      // Otherwise insert NOP.
-      InFlag = DAG.getNode(PPCISD::NOP, dl, MVT::Glue, InFlag);
+      needsTOCRestore = true;
+    } else if ((CallOpc == PPCISD::CALL_SVR4) && !isLocalCall(Callee)) {
+      // Otherwise insert NOP for non-local calls.
+      CallOpc = PPCISD::CALL_NOP_SVR4;
      }
    }
  
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  if (needsTOCRestore) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(BytesCalleePops, true),
                               InFlag);
@@ -2809,22 +3332,33 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
  }
  
  SDValue
-PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
-                             CallingConv::ID CallConv, bool isVarArg,
-                             bool &isTailCall,
-                             const SmallVectorImpl<ISD::OutputArg> &Outs,
-                             const SmallVectorImpl<SDValue> &OutVals,
-                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                             DebugLoc dl, SelectionDAG &DAG,
+PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
    if (isTailCall)
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                     Ins, DAG);
  
-  if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
-    return LowerCall_SVR4(Chain, Callee, CallConv, isVarArg,
-                          isTailCall, Outs, OutVals, Ins,
-                          dl, DAG, InVals);
+  if (PPCSubTarget.isSVR4ABI()) {
+    if (PPCSubTarget.isPPC64())
+      return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+    else
+      return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+  }
  
    return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
                            isTailCall, Outs, OutVals, Ins,
@@ -2832,15 +3366,15 @@ PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
  }
  
  SDValue
-PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
-                                  CallingConv::ID CallConv, bool isVarArg,
-                                  bool isTailCall,
-                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                  const SmallVectorImpl<SDValue> &OutVals,
-                                  const SmallVectorImpl<ISD::InputArg> &Ins,
-                                  DebugLoc dl, SelectionDAG &DAG,
-                                  SmallVectorImpl<SDValue> &InVals) const {
-  // See PPCTargetLowering::LowerFormalArguments_SVR4() for a description
+PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
    // of the 32-bit SVR4 ABI stack frame layout.
  
    assert((CallConv == CallingConv::C ||
@@ -2855,7 +3389,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
    // and restoring the callers stack pointer in this functions epilog. This is
    // done because by tail calling the called function might overwrite the value
    // in this function's (MF) stack pointer stack slot 0(SP).
-  if (GuaranteedTailCallOpt && CallConv==CallingConv::Fast)
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
      MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
  
    // Count how many bytes are to be pushed on the stack, including the linkage
@@ -2865,7 +3400,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
    // Assign locations to all of the outgoing arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                getTargetMachine(), ArgLocs, *DAG.getContext());
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
  
    // Reserve space for the linkage area on the stack.
    CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -2905,7 +3440,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
    // Assign locations to all of the outgoing aggregate by value arguments.
    SmallVector<CCValAssign, 16> ByValArgLocs;
    CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                     getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
  
    // Reserve stack space for the allocations in CCInfo.
    CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2941,6 +3476,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
    SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
    SmallVector<SDValue, 8> MemOpChains;
  
+  bool seenFloatArg = false;
    // Walk the register/memloc assignments, inserting copies/loads.
    for (unsigned i = 0, j = 0, e = ArgLocs.size();
         i != e;
@@ -2950,60 +3486,443 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
      ISD::ArgFlagsTy Flags = Outs[i].Flags;
  
      if (Flags.isByVal()) {
-      // Argument is an aggregate which is passed by value, thus we need to
-      // create a copy of it in the local variable space of the current stack
-      // frame (which is the stack frame of the caller) and pass the address of
-      // this copy to the callee.
-      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
-      CCValAssign &ByValVA = ByValArgLocs[j++];
-      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+      // Argument is an aggregate which is passed by value, thus we need to
+      // create a copy of it in the local variable space of the current stack
+      // frame (which is the stack frame of the caller) and pass the address of
+      // this copy to the callee.
+      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+      CCValAssign &ByValVA = ByValArgLocs[j++];
+      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+      // Memory reserved in the local variable space of the callers stack frame.
+      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+      // Create a copy of the argument in the local area of the current
+      // stack frame.
+      SDValue MemcpyCall =
+        CreateCopyOfByValArgument(Arg, PtrOff,
+                                  CallSeqStart.getNode()->getOperand(0),
+                                  Flags, DAG, dl);
+
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                             NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+
+      // Pass the address of the aggregate copy on the stack either in a
+      // physical register or in the parameter list area of the current stack
+      // frame to the callee.
+      Arg = PtrOff;
+    }
+
+    if (VA.isRegLoc()) {
+      seenFloatArg |= VA.getLocVT().isFloatingPoint();
+      // Put argument in a physical register.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Put argument in the parameter list area of the current stack frame.
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+
+      if (!isTailCall) {
+        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+      } else {
+        // Calculate and remember argument location.
+        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+                                 TailCallArguments);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Set CR bit 6 to true if this is a vararg call with floating args passed in
+  // registers.
+  if (isVarArg) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, InFlag };
+
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+                        dl, VTs, Ops, InFlag.getNode() ? 2 : 1);
+
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
+                    false, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+// Copy an argument into memory, being careful to do this outside the
+// call sequence for the call to which the argument belongs.
+SDValue
+PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                              SDValue CallSeqStart,
+                                              ISD::ArgFlagsTy Flags,
+                                              SelectionDAG &DAG,
+                                              DebugLoc dl) const {
+  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+                        CallSeqStart.getNode()->getOperand(0),
+                        Flags, DAG, dl);
+  // The MEMCPY must go outside the CALLSEQ_START..END.
+  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                             CallSeqStart.getNode()->getOperand(1));
+  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                         NewCallSeqStart.getNode());
+  return NewCallSeqStart;
+}
+
+SDValue
+PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  unsigned PtrByteSize = 8;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  unsigned nAltivecParamsAtEnd = 0;
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with at least 48 bytes, which
+  // is reserved space for [SP][CR][LR][3 x unused].
+  // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result
+  // of this call.
+  unsigned NumBytes =
+    CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv,
+                                         Outs, OutVals, nAltivecParamsAtEnd);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const uint16_t GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // Promote integers to 64-bit values.
+    if (Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      // Note: Size includes alignment padding, so
+      //   struct x { short a; char b; }
+      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
+      // These are the proper values we need for right-justifying the
+      // aggregate in a parameter register.
+      unsigned Size = Flags.getByValSize();
  
-      // Memory reserved in the local variable space of the callers stack frame.
-      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+      // An empty aggregate parameter takes up no storage and no
+      // registers.
+      if (Size == 0)
+        continue;
  
-      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
-      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (Size==1 || Size==2 || Size==4) {
+        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT,
+                                        false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
  
-      // Create a copy of the argument in the local area of the current
-      // stack frame.
-      SDValue MemcpyCall =
-        CreateCopyOfByValArgument(Arg, PtrOff,
-                                  CallSeqStart.getNode()->getOperand(0),
-                                  Flags, DAG, dl);
+          ArgOffset += PtrByteSize;
+          continue;
+        }
+      }
  
-      // This must go outside the CALLSEQ_START..END.
-      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                           CallSeqStart.getNode()->getOperand(1));
-      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
-                             NewCallSeqStart.getNode());
-      Chain = CallSeqStart = NewCallSeqStart;
+      if (GPR_idx == NumGPRs && Size < 8) {
+        SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                        PtrOff.getValueType());
+        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
  
-      // Pass the address of the aggregate copy on the stack either in a
-      // physical register or in the parameter list area of the current stack
-      // frame to the callee.
-      Arg = PtrOff;
+      // FIXME: The above statement is likely due to a misunderstanding of the
+      // documents.  All arguments must be copied into the parameter area BY
+      // THE CALLEE in the event that the callee takes the address of any
+      // formal argument.  That has not yet been implemented.  However, it is
+      // reasonable to use the stack area as a staging area for the register
+      // load.
+
+      // Skip this for small aggregates, as we will use the same slot for a
+      // right-justified copy, below.
+      if (Size >= 8)
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+      // When a register is available, pass a small aggregate right-justified.
+      if (Size < 8 && GPR_idx != NumGPRs) {
+        // The easiest way to get this right-justified in a register
+        // is to copy the structure into the rightmost portion of a
+        // local variable slot, then load the whole slot into the
+        // register.
+        // FIXME: The memcpy seems to produce pretty awful code for
+        // small aggregates, particularly for packed ones.
+        // FIXME: It would be preferable to use the slot in the 
+        // parameter save area instead of a new local variable.
+        SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+        // Load the slot into the register.
+        SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+        MemOpChains.push_back(Load.getValue(1));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+        // Done with this argument.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      // For aggregates larger than PtrByteSize, copy the pieces of the
+      // object that fit into registers from the parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
      }
  
-    if (VA.isRegLoc()) {
-      // Put argument in a physical register.
-      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    } else {
-      // Put argument in the parameter list area of the current stack frame.
-      assert(VA.isMemLoc());
-      unsigned LocMemOffset = VA.getLocMemOffset();
+    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
  
-      if (!isTailCall) {
-        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
-        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+        if (isVarArg) {
+          // A single float or an aggregate containing only a single float
+          // must be passed right-justified in the stack doubleword, and
+          // in the GPR, if one is available.
+          SDValue StoreOff;
+          if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) {
+            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+            StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+          } else
+            StoreOff = PtrOff;
  
-        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
-                                           MachinePointerInfo(),
-                                           false, false, 0));
+          SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
+                                       MachinePointerInfo(), false, false, 0);
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+                                       MachinePointerInfo(), false, false,
+                                       false, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+        } else if (GPR_idx != NumGPRs)
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          ++GPR_idx;
        } else {
-        // Calculate and remember argument location.
-        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
-                                 TailCallArguments);
+        // Single-precision floating-point values are mapped to the
+        // second (rightmost) word of the stack doubleword.
+        if (Arg.getValueType() == MVT::f32) {
+          SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+        }
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (isVarArg) {
+        // These go aligned on the stack, or in the corresponding R registers
+        // when within range.  The Darwin PPC ABI doc claims they also go in
+        // V registers; in fact gcc does this only for arguments that are
+        // prototyped, not for those that match the ...  We do it for all
+        // arguments, seems to work.
+        while (ArgOffset % 16 !=0) {
+          ArgOffset += PtrByteSize;
+          if (GPR_idx != NumGPRs)
+            GPR_idx++;
+        }
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                            DAG.getConstant(ArgOffset, PtrVT));
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
        }
+
+      // Non-varargs Altivec params generally go in registers, but have
+      // stack space allocated at the end.
+      if (VR_idx != NumVRs) {
+        // Doesn't have GPR space allocated.
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        ArgOffset += 16;
+      }
+      break;
      }
    }
  
@@ -3011,10 +3930,24 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains[0], MemOpChains.size());
  
-  // Set CR6 to true if this is a vararg call.
-  if (isVarArg) {
-    SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0);
-    RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
+  // Check if this is an indirect call (MTCTR/BCTRL).
+  // See PrepareCall() for more information about calls through function
+  // pointers in the 64-bit SVR4 ABI.
+  if (!isTailCall &&
+      !dyn_cast<GlobalAddressSDNode>(Callee) &&
+      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isBLACompatibleAddress(Callee, DAG)) {
+    // Load r2 into a virtual register and store it to the TOC save area.
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+    // TOC save area offset.
+    SDValue PtrOff = DAG.getIntPtrConstant(40);
+    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
+                         false, false, 0);
+    // R12 must contain the address of an indirect callee.  This does not
+    // mean the MTCTR instruction must use R12; it's easier to model this
+    // as an extra parameter, so do that.
+    RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
    }
  
    // Build a sequence of copy-to-reg nodes chained together with token chain
@@ -3027,8 +3960,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
    }
  
    if (isTailCall)
-    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
-                    false, TailCallArguments);
+    PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
  
    return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
                      RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
@@ -3045,7 +3978,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                                      DebugLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  
-  unsigned NumOps  = Outs.size();
+  unsigned NumOps = Outs.size();
  
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    bool isPPC64 = PtrVT == MVT::i64;
@@ -3058,7 +3991,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
    // and restoring the callers stack pointer in this functions epilog. This is
    // done because by tail calling the called function might overwrite the value
    // in this function's (MF) stack pointer stack slot 0(SP).
-  if (GuaranteedTailCallOpt && CallConv==CallingConv::Fast)
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
      MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
  
    unsigned nAltivecParamsAtEnd = 0;
@@ -3107,17 +4041,17 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
    unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
    unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  
-  static const unsigned GPR_32[] = {           // 32-bit registers.
+  static const uint16_t GPR_32[] = {           // 32-bit registers.
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
-  static const unsigned GPR_64[] = {           // 64-bit registers.
+  static const uint16_t GPR_64[] = {           // 64-bit registers.
      PPC::X3, PPC::X4, PPC::X5, PPC::X6,
      PPC::X7, PPC::X8, PPC::X9, PPC::X10,
    };
-  static const unsigned *FPR = GetFPR();
+  static const uint16_t *FPR = GetFPR();
  
-  static const unsigned VR[] = {
+  static const uint16_t VR[] = {
      PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
      PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
    };
@@ -3125,7 +4059,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
    const unsigned NumFPRs = 13;
    const unsigned NumVRs  = array_lengthof(VR);
  
-  const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;
+  const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32;
  
    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
    SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
@@ -3151,11 +4085,13 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
      }
  
      // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
      if (Flags.isByVal()) {
        unsigned Size = Flags.getByValSize();
+      // Very small objects are passed right-justified.  Everything else is
+      // passed left-justified.
        if (Size==1 || Size==2) {
-        // Very small objects are passed right-justified.
-        // Everything else is passed left-justified.
          EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
          if (GPR_idx != NumGPRs) {
            SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
@@ -3166,17 +4102,12 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
  
            ArgOffset += PtrByteSize;
          } else {
-          SDValue Const = DAG.getConstant(4 - Size, PtrOff.getValueType());
+          SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                          PtrOff.getValueType());
            SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
-          SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
-                                CallSeqStart.getNode()->getOperand(0),
-                                Flags, DAG, dl);
-          // This must go outside the CALLSEQ_START..END.
-          SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                               CallSeqStart.getNode()->getOperand(1));
-          DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
-                                 NewCallSeqStart.getNode());
-          Chain = CallSeqStart = NewCallSeqStart;
+          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                            CallSeqStart,
+                                                            Flags, DAG, dl);
            ArgOffset += PtrByteSize;
          }
          continue;
@@ -3184,22 +4115,20 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
        // Copy entire object into memory.  There are cases where gcc-generated
        // code assumes it is there, even if it could be put entirely into
        // registers.  (This is not what the doc says.)
-      SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
-                            CallSeqStart.getNode()->getOperand(0),
-                            Flags, DAG, dl);
-      // This must go outside the CALLSEQ_START..END.
-      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                           CallSeqStart.getNode()->getOperand(1));
-      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode());
-      Chain = CallSeqStart = NewCallSeqStart;
-      // And copy the pieces of it that fit into registers.
+      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                        CallSeqStart,
+                                                        Flags, DAG, dl);
+
+      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+      // copy the pieces of the object that fit into registers from the
+      // parameter save area.
        for (unsigned j=0; j<Size; j+=PtrByteSize) {
          SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
          if (GPR_idx != NumGPRs) {
            SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                       MachinePointerInfo(),
-                                     false, false, 0);
+                                     false, false, false, 0);
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
            ArgOffset += PtrByteSize;
@@ -3237,7 +4166,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
            // Float varargs are always shadowed in available integer registers
            if (GPR_idx != NumGPRs) {
              SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
-                                       MachinePointerInfo(), false, false, 0);
+                                       MachinePointerInfo(), false, false,
+                                       false, 0);
              MemOpChains.push_back(Load.getValue(1));
              RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
            }
@@ -3246,7 +4176,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
              PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
              SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
                                         MachinePointerInfo(),
-                                       false, false, 0);
+                                       false, false, false, 0);
              MemOpChains.push_back(Load.getValue(1));
              RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
            }
@@ -3260,11 +4190,10 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
              ++GPR_idx;
          }
-      } else {
+      } else
          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                           isPPC64, isTailCall, false, MemOpChains,
                           TailCallArguments, dl);
-      }
        if (isPPC64)
          ArgOffset += 8;
        else
@@ -3295,7 +4224,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
          if (VR_idx != NumVRs) {
            SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
                                       MachinePointerInfo(),
-                                     false, false, 0);
+                                     false, false, false, 0);
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
          }
@@ -3306,7 +4235,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
            SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                    DAG.getConstant(i, PtrVT));
            SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
-                                     false, false, 0);
+                                     false, false, false, 0);
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
@@ -3359,22 +4288,6 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains[0], MemOpChains.size());
  
-  // Check if this is an indirect call (MTCTR/BCTRL).
-  // See PrepareCall() for more information about calls through function
-  // pointers in the 64-bit SVR4 ABI.
-  if (!isTailCall && isPPC64 && PPCSubTarget.isSVR4ABI() &&
-      !dyn_cast<GlobalAddressSDNode>(Callee) &&
-      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
-      !isBLACompatibleAddress(Callee, DAG)) {
-    // Load r2 into a virtual register and store it to the TOC save area.
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
-    // TOC save area offset.
-    SDValue PtrOff = DAG.getIntPtrConstant(40);
-    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
-    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
-                         false, false, 0);
-  }
-
    // On Darwin, R12 must contain the address of an indirect callee.  This does
    // not mean the MTCTR instruction must use R12; it's easier to model this as
    // an extra parameter, so do that.
@@ -3403,6 +4316,17 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                      Ins, InVals);
  }
  
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
  SDValue
  PPCTargetLowering::LowerReturn(SDValue Chain,
                                 CallingConv::ID CallConv, bool isVarArg,
@@ -3412,7 +4336,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
  
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                getTargetMachine(), RVLocs, *DAG.getContext());
+                 getTargetMachine(), RVLocs, *DAG.getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
  
    // If this is the first return lowered for this function, add the regs to the
@@ -3428,8 +4352,24 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
    for (unsigned i = 0; i != RVLocs.size(); ++i) {
      CCValAssign &VA = RVLocs[i];
      assert(VA.isRegLoc() && "Can only return in registers!");
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                             OutVals[i], Flag);
+
+    SDValue Arg = OutVals[i];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
      Flag = Chain.getValue(1);
    }
  
@@ -3459,7 +4399,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
    // Load the old link SP.
    SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr,
                                     MachinePointerInfo(),
-                                   false, false, 0);
+                                   false, false, false, 0);
  
    // Restore the stack pointer.
    Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
@@ -3650,7 +4590,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
      FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
                          DAG.getConstant(4, FIPtr.getValueType()));
    return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MachinePointerInfo(),
-                     false, false, 0);
+                     false, false, false, 0);
  }
  
  SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -3661,7 +4601,52 @@ SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op,
      return SDValue();
  
    if (Op.getOperand(0).getValueType() == MVT::i64) {
-    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op.getOperand(0));
+    SDValue SINT = Op.getOperand(0);
+    // When converting to single-precision, we actually need to convert
+    // to double-precision first and then round to single-precision.
+    // To avoid double-rounding effects during that operation, we have
+    // to prepare the input operand.  Bits that might be truncated when
+    // converting to double-precision are replaced by a bit that won't
+    // be lost at this stage, but is below the single-precision rounding
+    // position.
+    //
+    // However, if -enable-unsafe-fp-math is in effect, accept double
+    // rounding to avoid the extra overhead.
+    if (Op.getValueType() == MVT::f32 &&
+        !DAG.getTarget().Options.UnsafeFPMath) {
+
+      // Twiddle input to make sure the low 11 bits are zero.  (If this
+      // is the case, we are guaranteed the value will fit into the 53 bit
+      // mantissa of an IEEE double-precision value without rounding.)
+      // If any of those low 11 bits were not zero originally, make sure
+      // bit 12 (value 2048) is set instead, so that the final rounding
+      // to single-precision gets the correct result.
+      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                                  SINT, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                          Round, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
+      Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                          Round, DAG.getConstant(-2048, MVT::i64));
+
+      // However, we cannot use that value unconditionally: if the magnitude
+      // of the input value is small, the bit-twiddling we did above might
+      // end up visibly changing the output.  Fortunately, in that case, we
+      // don't need to twiddle bits since the original input will convert
+      // exactly to double-precision floating-point already.  Therefore,
+      // construct a conditional to use the original value if the top 11
+      // bits are all sign-bit copies, and use the rounded value computed
+      // above otherwise.
+      SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
+                                 SINT, DAG.getConstant(53, MVT::i32));
+      Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                         Cond, DAG.getConstant(1, MVT::i64));
+      Cond = DAG.getSetCC(dl, MVT::i32,
+                          Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT);
+
+      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
+    }
+    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
      SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits);
      if (Op.getValueType() == MVT::f32)
        FP = DAG.getNode(ISD::FP_ROUND, dl,
@@ -3694,7 +4679,7 @@ SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op,
                              Ops, 4, MVT::i64, MMO);
    // Load the value as a double.
    SDValue Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, MachinePointerInfo(),
-                           false, false, 0);
+                           false, false, false, 0);
  
    // FCFID it and return it.
    SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Ld);
@@ -3746,7 +4731,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
    SDValue Four = DAG.getConstant(4, PtrVT);
    SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
    SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo(),
-                            false, false, 0);
+                            false, false, false, 0);
  
    // Transform as necessary
    SDValue CWD1 =
@@ -4006,7 +4991,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
      unsigned TypeShiftAmt = i & (SplatBitSize-1);
  
      // vsplti + shl self.
-    if (SextVal == (i << (int)TypeShiftAmt)) {
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
        SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
        static const unsigned IIDs[] = { // Intrinsic to use for each size.
          Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
@@ -4051,17 +5036,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
      }
  
      // t = vsplti c, result = vsldoi t, t, 1
-    if (SextVal == ((i << 8) | (i < 0 ? 0xFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
        SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
        return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
      }
      // t = vsplti c, result = vsldoi t, t, 2
-    if (SextVal == ((i << 16) | (i < 0 ? 0xFFFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
        SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
        return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
      }
      // t = vsplti c, result = vsldoi t, t, 3
-    if (SextVal == ((i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
        SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
        return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
      }
@@ -4212,8 +5197,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
  
    // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
    // perfect shuffle table to emit an optimal matching sequence.
-  SmallVector<int, 16> PermMask;
-  SVOp->getMask(PermMask);
+  ArrayRef<int> PermMask = SVOp->getMask();
  
    unsigned PFIndexes[4];
    bool isFourElementShuffle = true;
@@ -4417,7 +5401,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                 false, false, 0);
    // Load it out.
    return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo(),
-                     false, false, 0);
+                     false, false, false, 0);
  }
  
  SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
@@ -4487,10 +5471,11 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
    case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
    case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
-  case ISD::GlobalTLSAddress:   llvm_unreachable("TLS not implemented for PPC");
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
    case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
    case ISD::SETCC:              return LowerSETCC(Op, DAG);
-  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
    case ISD::VASTART:
      return LowerVASTART(Op, DAG, PPCSubTarget);
  
@@ -4524,7 +5509,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
    case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
    }
-  return SDValue();
  }
  
  void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
@@ -4534,8 +5518,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
    DebugLoc dl = N->getDebugLoc();
    switch (N->getOpcode()) {
    default:
-    assert(false && "Do not know how to custom type legalize this operation!");
-    return;
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
    case ISD::VAARG: {
      if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
          || TM.getSubtarget<PPCSubtarget>().isPPC64())
@@ -4828,11 +5811,37 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  
    MachineFunction *F = BB->getParent();
  
-  if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
-      MI->getOpcode() == PPC::SELECT_CC_I8 ||
-      MI->getOpcode() == PPC::SELECT_CC_F4 ||
-      MI->getOpcode() == PPC::SELECT_CC_F8 ||
-      MI->getOpcode() == PPC::SELECT_CC_VRRC) {
+  if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+                                 MI->getOpcode() == PPC::SELECT_CC_I8)) {
+    unsigned OpCode = MI->getOpcode() == PPC::SELECT_CC_I8 ?
+                                         PPC::ISEL8 : PPC::ISEL;
+    unsigned SelectPred = MI->getOperand(4).getImm();
+    DebugLoc dl = MI->getDebugLoc();
+
+    // The SelectPred is ((BI << 5) | BO) for a BCC
+    unsigned BO = SelectPred & 0xF;
+    assert((BO == 12 || BO == 4) && "invalid predicate BO field for isel");
+
+    unsigned TrueOpNo, FalseOpNo;
+    if (BO == 12) {
+      TrueOpNo = 2;
+      FalseOpNo = 3;
+    } else {
+      TrueOpNo = 3;
+      FalseOpNo = 2;
+      SelectPred = PPC::InvertPredicate((PPC::Predicate)SelectPred);
+    }
+
+    BuildMI(*BB, MI, dl, TII->get(OpCode), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(TrueOpNo).getReg())
+      .addReg(MI->getOperand(FalseOpNo).getReg())
+      .addImm(SelectPred).addReg(MI->getOperand(1).getReg());
+  } else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+             MI->getOpcode() == PPC::SELECT_CC_I8 ||
+             MI->getOpcode() == PPC::SELECT_CC_F4 ||
+             MI->getOpcode() == PPC::SELECT_CC_F8 ||
+             MI->getOpcode() == PPC::SELECT_CC_VRRC) {
+
  
      // The incoming instruction knows the destination vreg to set, the
      // condition code register to branch on, the true/false values to
@@ -5436,12 +6445,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
  //===----------------------------------------------------------------------===//
  
  void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       const APInt &Mask,
                                                         APInt &KnownZero,
                                                         APInt &KnownOne,
                                                         const SelectionDAG &DAG,
                                                         unsigned Depth) const {
-  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
    switch (Op.getOpcode()) {
    default: break;
    case PPCISD::LBRX: {
@@ -5487,6 +6495,14 @@ PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
      case 'v':
      case 'y':
        return C_RegisterClass;
+    case 'Z':
+      // FIXME: While Z does indicate a memory constraint, it specifically
+      // indicates an r+r address (used in conjunction with the 'y' modifier
+      // in the replacement string). Currently, we're forcing the base
+      // register to be r0 in the asm printer (which is interpreted as zero)
+      // and forming the complete address in the second register. This is
+      // suboptimal.
+      return C_Memory;
      }
    }
    return TargetLowering::getConstraintType(Constraint);
@@ -5529,6 +6545,9 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
    case 'y':
      weight = CW_Register;
      break;
+  case 'Z':
+    weight = CW_Memory;
+    break;
    }
    return weight;
  }
@@ -5542,18 +6561,18 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
      case 'b':   // R1-R31
      case 'r':   // R0-R31
        if (VT == MVT::i64 && PPCSubTarget.isPPC64())
-        return std::make_pair(0U, PPC::G8RCRegisterClass);
-      return std::make_pair(0U, PPC::GPRCRegisterClass);
+        return std::make_pair(0U, &PPC::G8RCRegClass);
+      return std::make_pair(0U, &PPC::GPRCRegClass);
      case 'f':
-      if (VT == MVT::f32)
-        return std::make_pair(0U, PPC::F4RCRegisterClass);
-      else if (VT == MVT::f64)
-        return std::make_pair(0U, PPC::F8RCRegisterClass);
+      if (VT == MVT::f32 || VT == MVT::i32)
+        return std::make_pair(0U, &PPC::F4RCRegClass);
+      if (VT == MVT::f64 || VT == MVT::i64)
+        return std::make_pair(0U, &PPC::F8RCRegClass);
        break;
      case 'v':
-      return std::make_pair(0U, PPC::VRRCRegisterClass);
+      return std::make_pair(0U, &PPC::VRRCRegClass);
      case 'y':   // crrc
-      return std::make_pair(0U, PPC::CRRCRegisterClass);
+      return std::make_pair(0U, &PPC::CRRCRegClass);
      }
    }
  
@@ -5675,7 +6694,7 @@ bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{
    return (V > -(1 << 16) && V < (1 << 16)-1);
  }
  
-bool PPCTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
+bool PPCTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const {
    return false;
  }
  
@@ -5704,13 +6723,13 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
      return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                         DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                     FrameAddr, Offset),
-                       MachinePointerInfo(), false, false, 0);
+                       MachinePointerInfo(), false, false, false, 0);
    }
  
    // Just load the return address off the stack.
    SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
-                     RetAddrFI, MachinePointerInfo(), false, false, 0);
+                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
  }
  
  SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
@@ -5724,16 +6743,19 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
    MachineFunction &MF = DAG.getMachineFunction();
    MachineFrameInfo *MFI = MF.getFrameInfo();
    MFI->setFrameAddressIsTaken(true);
-  bool is31 = (DisableFramePointerElim(MF) || MFI->hasVarSizedObjects()) &&
+  bool is31 = (getTargetMachine().Options.DisableFramePointerElim(MF) ||
+               MFI->hasVarSizedObjects()) &&
                    MFI->getStackSize() &&
-                  !MF.getFunction()->hasFnAttr(Attribute::Naked);
+                  !MF.getFunction()->getFnAttributes().
+                    hasAttribute(Attributes::Naked);
    unsigned FrameReg = isPPC64 ? (is31 ? PPC::X31 : PPC::X1) :
                                  (is31 ? PPC::R31 : PPC::R1);
    SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                           PtrVT);
    while (Depth--)
      FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
-                            FrameAddr, MachinePointerInfo(), false, false, 0);
+                            FrameAddr, MachinePointerInfo(), false, false,
+                            false, 0);
    return FrameAddr;
  }
  
@@ -5749,7 +6771,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
  /// means there isn't a need to check it against alignment requirement,
  /// probably because the source does not need to be loaded. If
-/// 'NonScalarIntSafe' is true, that means it's safe to return a
+/// 'IsZeroVal' is true, that means it's safe to return a
  /// non-scalar-integer type, e.g. empty string source, constant, or loaded
  /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
  /// constant so it does not need to be loaded.
@@ -5757,7 +6779,7 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  /// target-independent logic.
  EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
                                             unsigned DstAlign, unsigned SrcAlign,
-                                           bool NonScalarIntSafe,
+                                           bool IsZeroVal,
                                             bool MemcpyStrSrc,
                                             MachineFunction &MF) const {
    if (this->PPCSubTarget.isPPC64()) {
@@ -5766,3 +6788,31 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
      return MVT::i32;
    }
  }
+
+/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+/// is expanded to mul + add.
+bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+  case MVT::v4f32:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
+  if (DisableILPPref)
+    return TargetLowering::getSchedulingPreference(N);
+
+  return Sched::ILP;
+}
+