Move dllimport name mangling to IR mangler.

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index ca6b614ce2ea7a3b489c3c6df8de77e46d42615c..28069cea5c080cb2cba44893a943bddb1bd7435c 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -23,6 +23,7 @@
  #include "MCTargetDesc/ARMAddressingModes.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/IntrinsicLowering.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -40,6 +41,7 @@
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Type.h"
  #include "llvm/MC/MCSectionMachO.h"
@@ -47,6 +49,7 @@
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetOptions.h"
  #include <utility>
  using namespace llvm;
@@ -156,26 +159,18 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
    addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
  }
  
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  if (TT.isOSBinFormatMachO())
-    return new TargetLoweringObjectFileMachO();
-  if (TT.isOSWindows())
-    return new TargetLoweringObjectFileCOFF();
-  return new ARMElfTargetObjectFile();
-}
-
-ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
-  Subtarget = &TM.getSubtarget<ARMSubtarget>();
-  RegInfo = TM.getSubtargetImpl()->getRegisterInfo();
-  Itins = TM.getSubtargetImpl()->getInstrItineraryData();
+ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
+                                     const ARMSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
+  RegInfo = Subtarget->getRegisterInfo();
+  Itins = Subtarget->getInstrItineraryData();
  
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  
    if (Subtarget->isTargetMachO()) {
      // Uses VFP for Thumb libfuncs if available.
      if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
-        Subtarget->hasARMOps() && !TM.Options.UseSoftFloat) {
+        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
        // Single-precision floating-point arithmetic.
        setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
        setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
@@ -406,33 +401,34 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
    else
      addRegisterClass(MVT::i32, &ARM::GPRRegClass);
-  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
        !Subtarget->isThumb1Only()) {
      addRegisterClass(MVT::f32, &ARM::SPRRegClass);
      addRegisterClass(MVT::f64, &ARM::DPRRegClass);
    }
  
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
-      setTruncStoreAction((MVT::SimpleValueType)VT,
-                          (MVT::SimpleValueType)InnerVT, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+  for (MVT VT : MVT::vector_valuetypes()) {
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(VT, InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+    }
  
-    setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
  
-    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
    }
  
    setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
    setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
  
+  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
+  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
+
    if (Subtarget->hasNEON()) {
      addDRTypeForNEON(MVT::v2f32);
      addDRTypeForNEON(MVT::v8i8);
@@ -575,15 +571,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setTargetDAGCombine(ISD::FP_TO_SINT);
      setTargetDAGCombine(ISD::FP_TO_UINT);
      setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::LOAD);
  
      // It is legal to extload from v4i8 to v4i16 or v4i32.
-    MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
-                  MVT::v4i16, MVT::v2i16,
-                  MVT::v2i32};
-    for (unsigned i = 0; i < 6; ++i) {
-      setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
-      setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
-      setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
+    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
+                   MVT::v2i32}) {
+      for (MVT VT : MVT::integer_vector_valuetypes()) {
+        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
+        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
+      }
      }
    }
  
@@ -621,15 +618,23 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FRINT,      MVT::f64, Expand);
      setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
      setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
      setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
      setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
    }
  
-  computeRegisterProperties();
+  computeRegisterProperties(Subtarget->getRegisterInfo());
  
    // ARM does not have floating-point extending loads.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+  }
  
    // ... or truncating stores
    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -637,7 +642,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  
    // ARM does not have i1 sign extending load.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
  
    // ARM supports all 4 flavors of integer indexed load / store.
    if (!Subtarget->isThumb1Only()) {
@@ -817,7 +823,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    }
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  
-  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
        !Subtarget->isThumb1Only()) {
      // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
      // iff target supports vfp2.
@@ -858,7 +864,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
    setOperationAction(ISD::FREM,      MVT::f64, Expand);
    setOperationAction(ISD::FREM,      MVT::f32, Expand);
-  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
        !Subtarget->isThumb1Only()) {
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
@@ -872,17 +878,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    }
  
    // Various VFP goodness
-  if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) {
-    // int <-> fp are custom expanded into bit_convert + ARMISD ops.
-    if (Subtarget->hasVFP2()) {
-      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
-      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
-      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
-      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
-    }
-
-    // v8 adds f64 <-> f16 conversion. Before that it should be expanded.
-    if (!Subtarget->hasV8Ops()) {
+  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
+    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
+    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
        setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
        setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
      }
@@ -898,7 +896,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    if (Subtarget->hasSinCos()) {
      setLibcallName(RTLIB::SINCOS_F32, "sincosf");
      setLibcallName(RTLIB::SINCOS_F64, "sincos");
-    if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
+    if (Subtarget->getTargetTriple().isiOS()) {
        // For iOS, we don't want to the normal expansion of a libcall to
        // sincos. We want to issue a libcall to __sincos_stret.
        setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
@@ -906,16 +904,21 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      }
    }
  
-  // ARMv8 implements a lot of rounding-like FP operations.
-  if (Subtarget->hasV8Ops()) {
-    static MVT RoundingTypes[] = {MVT::f32, MVT::f64};
-    for (const auto Ty : RoundingTypes) {
-      setOperationAction(ISD::FFLOOR, Ty, Legal);
-      setOperationAction(ISD::FCEIL, Ty, Legal);
-      setOperationAction(ISD::FROUND, Ty, Legal);
-      setOperationAction(ISD::FTRUNC, Ty, Legal);
-      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
-      setOperationAction(ISD::FRINT, Ty, Legal);
+  // FP-ARMv8 implements a lot of rounding-like FP operations.
+  if (Subtarget->hasFPARMv8()) {
+    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::f32, Legal);
+    if (!Subtarget->isFPOnlySP()) {
+      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+      setOperationAction(ISD::FROUND, MVT::f64, Legal);
+      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::f64, Legal);
      }
    }
    // We have target-specific dag combine patterns for the following nodes:
@@ -932,7 +935,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  
    setStackPointerRegisterToSaveRestore(ARM::SP);
  
-  if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() ||
+  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
        !Subtarget->hasVFP2())
      setSchedulingPreference(Sched::RegPressure);
    else
@@ -956,6 +959,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
  }
  
+bool ARMTargetLowering::useSoftFloat() const {
+  return Subtarget->useSoftFloat();
+}
+
  // FIXME: It might make sense to define the representative register class as the
  // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
  // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -966,13 +973,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  // of the difficulty prior to coalescing of modeling operand register classes
  // due to the common occurrence of cross class copies and subregister insertions
  // and extractions.
-std::pair<const TargetRegisterClass*, uint8_t>
-ARMTargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
    const TargetRegisterClass *RRC = nullptr;
    uint8_t Cost = 1;
    switch (VT.SimpleTy) {
    default:
-    return TargetLowering::findRepresentativeClass(VT);
+    return TargetLowering::findRepresentativeClass(TRI, VT);
    // Use DPR as representative register class for all floating point
    // and vector types. Since there are 32 SPR registers and 32 DPR registers so
    // the cost is 1 for both f32 and f64.
@@ -1004,11 +1012,12 @@ ARMTargetLowering::findRepresentativeClass(MVT VT) const{
  }
  
  const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
-  switch (Opcode) {
-  default: return nullptr;
+  switch ((ARMISD::NodeType)Opcode) {
+  case ARMISD::FIRST_NUMBER:  break;
    case ARMISD::Wrapper:       return "ARMISD::Wrapper";
    case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
    case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
+  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
    case ARMISD::CALL:          return "ARMISD::CALL";
    case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
    case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
@@ -1031,11 +1040,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  
    case ARMISD::RBIT:          return "ARMISD::RBIT";
  
-  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
-  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
-  case ARMISD::SITOF:         return "ARMISD::SITOF";
-  case ARMISD::UITOF:         return "ARMISD::UITOF";
-
    case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
    case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
    case ARMISD::RRX:           return "ARMISD::RRX";
@@ -1090,6 +1094,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
    case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
    case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
+  case ARMISD::VSLI:          return "ARMISD::VSLI";
+  case ARMISD::VSRI:          return "ARMISD::VSRI";
    case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
    case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
    case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
@@ -1140,6 +1146,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
    case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
    }
+  return nullptr;
  }
  
  EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
@@ -1162,6 +1169,20 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
    return TargetLowering::getRegClassFor(VT);
  }
  
+// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects passed to memory intrinsics.
+bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+                                               unsigned &PrefAlign) const {
+  if (!isa<MemIntrinsic>(CI))
+    return false;
+  MinSize = 8;
+  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
+  // cycle faster than 4-byte aligned LDM.
+  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
+  return true;
+}
+
  // Create a fast isel object.
  FastISel *
  ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
@@ -1169,12 +1190,6 @@ ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
    return ARM::createFastISel(funcInfo, libInfo);
  }
  
-/// getMaximalGlobalOffset - Returns the maximal possible offset which can
-/// be used for loads / stores from the global.
-unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
-  return (Subtarget->isThumb1Only() ? 127 : 4095);
-}
-
  Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
    unsigned NumVals = N->getNumValues();
    if (!NumVals)
@@ -1193,8 +1208,7 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  
    // Load are scheduled for latency even if there instruction itinerary
    // is not available.
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
  
    if (MCID.getNumDefs() == 0)
@@ -1369,7 +1383,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
        if (VA.getLocVT() == MVT::v2f64) {
          SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
          Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
-                          DAG.getConstant(0, MVT::i32));
+                          DAG.getConstant(0, dl, MVT::i32));
  
          VA = RVLocs[++i]; // skip ahead to next loc
          Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
@@ -1383,7 +1397,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
            std::swap (Lo, Hi);
          Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
          Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
-                          DAG.getConstant(1, MVT::i32));
+                          DAG.getConstant(1, dl, MVT::i32));
        }
      } else {
        Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
@@ -1414,7 +1428,7 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
                                      const CCValAssign &VA,
                                      ISD::ArgFlagsTy Flags) const {
    unsigned LocMemOffset = VA.getLocMemOffset();
-  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
    PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
    return DAG.getStore(Chain, dl, Arg, PtrOff,
                        MachinePointerInfo::getStack(LocMemOffset),
@@ -1454,7 +1468,7 @@ SDValue
  ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
    SelectionDAG &DAG                     = CLI.DAG;
-  SDLoc &dl                          = CLI.DL;
+  SDLoc &dl                             = CLI.DL;
    SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
    SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
    SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
@@ -1469,9 +1483,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
    bool isThisReturn   = false;
    bool isSibCall      = false;
+  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
  
    // Disable tail calls if they're not supported.
-  if (!Subtarget->supportsTailCall() || MF.getTarget().Options.DisableTailCalls)
+  if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
      isTailCall = false;
  
    if (isTailCall) {
@@ -1508,8 +1523,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    // Adjust the stack pointer for the new arguments...
    // These operations are automatically eliminated by the prolog/epilog pass
    if (!isSibCall)
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                                 dl);
+    Chain = DAG.getCALLSEQ_START(Chain,
+                                 DAG.getIntPtrConstant(NumBytes, dl, true), dl);
  
    SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
  
@@ -1548,9 +1563,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      if (VA.needsCustom()) {
        if (VA.getLocVT() == MVT::v2f64) {
          SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
-                                  DAG.getConstant(0, MVT::i32));
+                                  DAG.getConstant(0, dl, MVT::i32));
          SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
-                                  DAG.getConstant(1, MVT::i32));
+                                  DAG.getConstant(1, dl, MVT::i32));
  
          PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
@@ -1585,7 +1600,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        // True if this byval aggregate will be split between registers
        // and memory.
        unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
-      unsigned CurByValIdx = CCInfo.getInRegsParamsProceed();
+      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
  
        if (CurByValIdx < ByValArgsCount) {
  
@@ -1595,7 +1610,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
          EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
          unsigned int i, j;
          for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
-          SDValue Const = DAG.getConstant(4*i, MVT::i32);
+          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
            SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
            SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                       MachinePointerInfo(),
@@ -1614,14 +1629,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
        if (Flags.getByValSize() > 4*offset) {
          unsigned LocMemOffset = VA.getLocMemOffset();
-        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
          SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
                                    StkPtrOff);
-        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
+        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
          SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
-        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
+        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
                                             MVT::i32);
-        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
+        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
+                                            MVT::i32);
  
          SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
          SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
@@ -1735,11 +1751,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      } else if (Subtarget->isTargetCOFF()) {
        assert(Subtarget->isTargetWindows() &&
               "Windows is the only supported COFF target");
-      unsigned TargetFlags = GV->hasDLLImportStorageClass()
-                                 ? ARMII::MO_DLLIMPORT
-                                 : ARMII::MO_NO_FLAG;
        Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0,
-                                          TargetFlags);
+                                          ARMII::MO_NO_FLAG);
        if (GV->hasDLLImportStorageClass())
          Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                               DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(),
@@ -1771,7 +1784,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             DAG.getEntryNode(), CPAddr,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, 0);
-      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
        Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
                             getPointerTy(), Callee, PICLabel);
      } else {
@@ -1786,8 +1799,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
    // FIXME: handle tail calls differently.
    unsigned CallOpc;
-  bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::MinSize);
+  bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
    if (Subtarget->isThumb()) {
      if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
        CallOpc = ARMISD::CALL_NOLINK;
@@ -1818,21 +1830,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    // Add a register mask operand representing the call-preserved registers.
    if (!isTailCall) {
      const uint32_t *Mask;
-    const TargetRegisterInfo *TRI =
-        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
-    const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
+    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
      if (isThisReturn) {
        // For 'this' returns, use the R0-preserving mask if applicable
-      Mask = ARI->getThisReturnPreservedMask(CallConv);
+      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
        if (!Mask) {
          // Set isThisReturn to false if the calling convention is not one that
          // allows 'returned' to be modeled in this way, so LowerCallResult does
          // not try to pass 'this' straight through
          isThisReturn = false;
-        Mask = ARI->getCallPreservedMask(CallConv);
+        Mask = ARI->getCallPreservedMask(MF, CallConv);
        }
      } else
-      Mask = ARI->getCallPreservedMask(CallConv);
+      Mask = ARI->getCallPreservedMask(MF, CallConv);
  
      assert(Mask && "Missing call preserved mask for calling convention");
      Ops.push_back(DAG.getRegisterMask(Mask));
@@ -1842,15 +1852,17 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      Ops.push_back(InFlag);
  
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  if (isTailCall)
+  if (isTailCall) {
+    MF.getFrameInfo()->setHasTailCall();
      return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
+  }
  
    // Returns a chain and a flag for retval copy to use.
    Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
    InFlag = Chain.getValue(1);
  
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag, dl);
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
    if (!Ins.empty())
      InFlag = Chain.getValue(1);
  
@@ -1865,58 +1877,58 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  /// on the stack.  Remember the next parameter register to allocate,
  /// and then confiscate the rest of the parameter registers to insure
  /// this.
-void
-ARMTargetLowering::HandleByVal(
-    CCState *State, unsigned &size, unsigned Align) const {
-  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
+void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
+                                    unsigned Align) const {
    assert((State->getCallOrPrologue() == Prologue ||
            State->getCallOrPrologue() == Call) &&
           "unhandled ParmContext");
  
-  if ((ARM::R0 <= reg) && (reg <= ARM::R3)) {
-    if (Subtarget->isAAPCS_ABI() && Align > 4) {
-      unsigned AlignInRegs = Align / 4;
-      unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
-      for (unsigned i = 0; i < Waste; ++i)
-        reg = State->AllocateReg(GPRArgRegs, 4);
-    }
-    if (reg != 0) {
-      unsigned excess = 4 * (ARM::R4 - reg);
-
-      // Special case when NSAA != SP and parameter size greater than size of
-      // all remained GPR regs. In that case we can't split parameter, we must
-      // send it to stack. We also must set NCRN to R4, so waste all
-      // remained registers.
-      const unsigned NSAAOffset = State->getNextStackOffset();
-      if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
-        while (State->AllocateReg(GPRArgRegs, 4))
-          ;
-        return;
-      }
+  // Byval (as with any stack) slots are always at least 4 byte aligned.
+  Align = std::max(Align, 4U);
  
-      // First register for byval parameter is the first register that wasn't
-      // allocated before this method call, so it would be "reg".
-      // If parameter is small enough to be saved in range [reg, r4), then
-      // the end (first after last) register would be reg + param-size-in-regs,
-      // else parameter would be splitted between registers and stack,
-      // end register would be r4 in this case.
-      unsigned ByValRegBegin = reg;
-      unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4;
-      State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
-      // Note, first register is allocated in the beginning of function already,
-      // allocate remained amount of registers we need.
-      for (unsigned i = reg+1; i != ByValRegEnd; ++i)
-        State->AllocateReg(GPRArgRegs, 4);
-      // A byval parameter that is split between registers and memory needs its
-      // size truncated here.
-      // In the case where the entire structure fits in registers, we set the
-      // size in memory to zero.
-      if (size < excess)
-        size = 0;
-      else
-        size -= excess;
-    }
+  unsigned Reg = State->AllocateReg(GPRArgRegs);
+  if (!Reg)
+    return;
+
+  unsigned AlignInRegs = Align / 4;
+  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
+  for (unsigned i = 0; i < Waste; ++i)
+    Reg = State->AllocateReg(GPRArgRegs);
+
+  if (!Reg)
+    return;
+
+  unsigned Excess = 4 * (ARM::R4 - Reg);
+
+  // Special case: when NSAA != SP and the parameter size is greater than the
+  // size of all remaining GPR regs, we can't split the parameter and must
+  // send it to the stack. We must also set NCRN to R4, so waste all
+  // remaining registers.
+  const unsigned NSAAOffset = State->getNextStackOffset();
+  if (NSAAOffset != 0 && Size > Excess) {
+    while (State->AllocateReg(GPRArgRegs))
+      ;
+    return;
    }
+
+  // The first register for the byval parameter is the first register that
+  // wasn't allocated before this method call, so it is "Reg".
+  // If the parameter is small enough to fit in the range [Reg, r4), then
+  // the end (one past the last) register is Reg + param-size-in-regs;
+  // otherwise the parameter is split between registers and the stack, and
+  // the end register is r4.
+  unsigned ByValRegBegin = Reg;
+  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
+  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
+  // Note: the first register has already been allocated above, so only
+  // allocate the remaining registers we need.
+  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
+    State->AllocateReg(GPRArgRegs);
+  // A byval parameter that is split between registers and memory needs its
+  // size truncated here.
+  // In the case where the entire structure fits in registers, we set the
+  // size in memory to zero.
+  Size = std::max<int>(Size - Excess, 0);
  }
  
  /// MatchingStackOffset - Return true if the given stack call argument is
@@ -1999,7 +2011,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    if (isCalleeStructRet || isCallerStructRet)
      return false;
  
-  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
+  // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
    // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
    // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
    // support in the assembler and linker to be used. This would need to be
@@ -2028,7 +2040,9 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    // cannot rely on the linker replacing the tail call with a return.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
      const GlobalValue *GV = G->getGlobal();
-    if (GV->hasExternalWeakLinkage())
+    const Triple TT(getTargetMachine().getTargetTriple());
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
        return false;
    }
  
@@ -2087,8 +2101,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        // the caller's fixed stack objects.
        MachineFrameInfo *MFI = MF.getFrameInfo();
        const MachineRegisterInfo *MRI = &MF.getRegInfo();
-      const TargetInstrInfo *TII =
-          getTargetMachine().getSubtargetImpl()->getInstrInfo();
+      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
        for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
             i != e;
             ++i, ++realArgIdx) {
@@ -2163,7 +2176,8 @@ static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
      report_fatal_error("Unsupported interrupt attribute. If present, value "
                         "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
  
-  RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false));
+  RetOps.insert(RetOps.begin() + 1,
+                DAG.getConstant(LROffset, DL, MVT::i32, false));
  
    return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
  }
@@ -2216,7 +2230,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
        if (VA.getLocVT() == MVT::v2f64) {
          // Extract the first half and return it in two registers.
          SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
-                                   DAG.getConstant(0, MVT::i32));
+                                   DAG.getConstant(0, dl, MVT::i32));
          SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                         DAG.getVTList(MVT::i32, MVT::i32), Half);
  
@@ -2235,7 +2249,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
  
          // Extract the 2nd half and fall through to handle it as an f64 value.
          Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
-                          DAG.getConstant(1, MVT::i32));
+                          DAG.getConstant(1, dl, MVT::i32));
        }
        // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
        // available.
@@ -2359,12 +2373,32 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
    if (!Subtarget->supportsTailCall())
      return false;
  
-  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
+  auto Attr =
+      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
+  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
      return false;
  
    return !Subtarget->isThumb1Only();
  }
  
+// Lower a WRITE_REGISTER node whose value operand (operand 2) is i64: split it
+// into its low and high i32 halves and re-emit the node with both halves.
+static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue WriteValue = Op->getOperand(2);
+
+  // Only an i64 value operand is expected here; anything else is a caller bug.
+  assert(WriteValue.getValueType() == MVT::i64
+          && "LowerWRITE_REGISTER called for non-i64 type argument.");
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
+                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
+                           DAG.getConstant(1, DL, MVT::i32));
+  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
+  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
+}
+
  // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
  // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
  // one of the above mentioned nodes. It has to be wrapped because otherwise
@@ -2416,7 +2450,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                 false, false, false, 0);
    if (RelocM == Reloc::Static)
      return Result;
-  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
    return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
  }
  
@@ -2440,7 +2474,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                           false, false, false, 0);
    SDValue Chain = Argument.getValue(1);
  
-  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
  
    // call __tls_get_addr.
@@ -2492,7 +2526,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                           false, false, false, 0);
      Chain = Offset.getValue(1);
  
-    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
  
      Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
@@ -2609,8 +2643,6 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
           "Windows on ARM expects to use movw/movt");
  
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  const ARMII::TOF TargetFlags =
-    (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
    EVT PtrVT = getPointerTy();
    SDValue Result;
    SDLoc DL(Op);
@@ -2621,7 +2653,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
    // operands, expand this into two nodes.
    Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
                         DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
-                                                  TargetFlags));
+                                                  ARMII::MO_NO_FLAG));
    if (GV->hasDLLImportStorageClass())
      Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                           MachinePointerInfo::getGOT(), false, false, false, 0);
@@ -2646,14 +2678,14 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                                 MachinePointerInfo::getConstantPool(),
                                 false, false, false, 0);
-  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
  }
  
  SDValue
  ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl(Op);
-  SDValue Val = DAG.getConstant(0, MVT::i32);
+  SDValue Val = DAG.getConstant(0, dl, MVT::i32);
    return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
                       DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
                       Op.getOperand(1), Val);
@@ -2663,7 +2695,7 @@ SDValue
  ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl(Op);
    return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
-                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
+                     Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
  }
  
  SDValue
@@ -2702,7 +2734,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                    false, false, false, 0);
  
      if (RelocM == Reloc::PIC_) {
-      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
        Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
      }
      return Result;
@@ -2728,7 +2760,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
      assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
             "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
      return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
-                       DAG.getConstant(0, MVT::i32));
+                       DAG.getConstant(0, dl, MVT::i32));
    }
  
    ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
@@ -2745,8 +2777,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
    }
  
    return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(Intrinsic::arm_dmb, MVT::i32),
-                     DAG.getConstant(Domain, MVT::i32));
+                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
+                     DAG.getConstant(Domain, dl, MVT::i32));
  }
  
  static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
@@ -2772,8 +2804,8 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
    }
  
    return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
-                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
-                     DAG.getConstant(isData, MVT::i32));
+                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
+                     DAG.getConstant(isData, dl, MVT::i32));
  }
  
  static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
@@ -2826,55 +2858,6 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
  }
  
-void
-ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
-                                  unsigned InRegsParamRecordIdx,
-                                  unsigned ArgSize,
-                                  unsigned &ArgRegsSize,
-                                  unsigned &ArgRegsSaveSize)
-  const {
-  unsigned NumGPRs;
-  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
-    unsigned RBegin, REnd;
-    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
-    NumGPRs = REnd - RBegin;
-  } else {
-    unsigned int firstUnalloced;
-    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
-                                                sizeof(GPRArgRegs) /
-                                                sizeof(GPRArgRegs[0]));
-    NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
-  }
-
-  unsigned Align = MF.getTarget()
-                       .getSubtargetImpl()
-                       ->getFrameLowering()
-                       ->getStackAlignment();
-  ArgRegsSize = NumGPRs * 4;
-
-  // If parameter is split between stack and GPRs...
-  if (NumGPRs && Align > 4 &&
-      (ArgRegsSize < ArgSize ||
-        InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
-    // Add padding for part of param recovered from GPRs.  For example,
-    // if Align == 8, its last byte must be at address K*8 - 1.
-    // We need to do it, since remained (stack) part of parameter has
-    // stack alignment, and we need to "attach" "GPRs head" without gaps
-    // to it:
-    // Stack:
-    // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes...
-    // [ [padding] [GPRs head] ] [        Tail passed via stack       ....
-    //
-    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-    unsigned Padding =
-        OffsetToAlignment(ArgRegsSize + AFI->getArgRegsSaveSize(), Align);
-    ArgRegsSaveSize = ArgRegsSize + Padding;
-  } else
-    // We don't need to extend regs save size for byval parameters if they
-    // are passed via GPRs only.
-    ArgRegsSaveSize = ArgRegsSize;
-}
-
  // The remaining GPRs hold either the beginning of variable-argument
  // data, or the beginning of an aggregate passed by value (usually
  // byval).  Either way, we allocate stack slots adjacent to the data
@@ -2888,13 +2871,8 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                    SDLoc dl, SDValue &Chain,
                                    const Value *OrigArg,
                                    unsigned InRegsParamRecordIdx,
-                                  unsigned OffsetFromOrigArg,
-                                  unsigned ArgOffset,
-                                  unsigned ArgSize,
-                                  bool ForceMutable,
-                                  unsigned ByValStoreOffset,
-                                  unsigned TotalArgRegsSaveSize) const {
-
+                                  int ArgOffset,
+                                  unsigned ArgSize) const {
    // Currently, two use-cases possible:
    // Case #1. Non-var-args function, and we meet first byval parameter.
    //          Setup first unallocated register as first byval register;
@@ -2909,83 +2887,39 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
    MachineFunction &MF = DAG.getMachineFunction();
    MachineFrameInfo *MFI = MF.getFrameInfo();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned firstRegToSaveIndex, lastRegToSaveIndex;
    unsigned RBegin, REnd;
    if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
      CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
-    firstRegToSaveIndex = RBegin - ARM::R0;
-    lastRegToSaveIndex = REnd - ARM::R0;
    } else {
-    firstRegToSaveIndex = CCInfo.getFirstUnallocated
-      (GPRArgRegs, array_lengthof(GPRArgRegs));
-    lastRegToSaveIndex = 4;
-  }
-
-  unsigned ArgRegsSize, ArgRegsSaveSize;
-  computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize,
-                 ArgRegsSize, ArgRegsSaveSize);
-
-  // Store any by-val regs to their spots on the stack so that they may be
-  // loaded by deferencing the result of formal parameter pointer or va_next.
-  // Note: once stack area for byval/varargs registers
-  // was initialized, it can't be initialized again.
-  if (ArgRegsSaveSize) {
-    unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
-
-    if (Padding) {
-      assert(AFI->getStoredByValParamsPadding() == 0 &&
-             "The only parameter may be padded.");
-      AFI->setStoredByValParamsPadding(Padding);
-    }
-
-    int FrameIndex = MFI->CreateFixedObject(ArgRegsSaveSize,
-                                            Padding +
-                                              ByValStoreOffset -
-                                              (int64_t)TotalArgRegsSaveSize,
-                                            false);
-    SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
-    if (Padding) {
-       MFI->CreateFixedObject(Padding,
-                              ArgOffset + ByValStoreOffset -
-                                (int64_t)ArgRegsSaveSize,
-                              false);
-    }
-
-    SmallVector<SDValue, 4> MemOps;
-    for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex;
-         ++firstRegToSaveIndex, ++i) {
-      const TargetRegisterClass *RC;
-      if (AFI->isThumb1OnlyFunction())
-        RC = &ARM::tGPRRegClass;
-      else
-        RC = &ARM::GPRRegClass;
+    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
+    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
+    REnd = ARM::R4;
+  }
  
-      unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
-      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
-      SDValue Store =
-        DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                     MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
-                     false, false, 0);
-      MemOps.push_back(Store);
-      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
-                        DAG.getConstant(4, getPointerTy()));
-    }
+  if (REnd != RBegin)
+    ArgOffset = -4 * (ARM::R4 - RBegin);
  
-    AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize());
+  int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false);
+  SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
  
-    if (!MemOps.empty())
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
-    return FrameIndex;
-  } else {
-    if (ArgSize == 0) {
-      // We cannot allocate a zero-byte object for the first variadic argument,
-      // so just make up a size.
-      ArgSize = 4;
-    }
-    // This will point to the next argument passed via stack.
-    return MFI->CreateFixedObject(
-      ArgSize, ArgOffset, !ForceMutable);
+  SmallVector<SDValue, 4> MemOps;
+  const TargetRegisterClass *RC =
+      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+
+  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
+    unsigned VReg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+    SDValue Store =
+        DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                     MachinePointerInfo(OrigArg, 4 * i), false, false, 0);
+    MemOps.push_back(Store);
+    FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+                      DAG.getConstant(4, dl, getPointerTy()));
    }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  return FrameIndex;
  }
  
  // Setup stack frame, the va_list pointer will start from.
@@ -3003,11 +2937,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
    // the result of va_next.
    // If there is no regs to be stored, just point address after last
    // argument passed via stack.
-  int FrameIndex =
-    StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
-                   CCInfo.getInRegsParamsCount(), 0, ArgOffset, 0, ForceMutable,
-                   0, TotalArgRegsSaveSize);
-
+  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
+                                  CCInfo.getInRegsParamsCount(),
+                                  CCInfo.getNextStackOffset(), 4);
    AFI->setVarArgsFrameIndex(FrameIndex);
  }
  
@@ -3033,7 +2965,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                                    isVarArg));
  
    SmallVector<SDValue, 16> ArgValues;
-  int lastInsIndex = -1;
    SDValue ArgValue;
    Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
    unsigned CurArgIdx = 0;
@@ -3043,55 +2974,48 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
    // We also increase this value in case of varargs function.
    AFI->setArgRegsSaveSize(0);
  
-  unsigned ByValStoreOffset = 0;
-  unsigned TotalArgRegsSaveSize = 0;
-  unsigned ArgRegsSaveSizeMaxAlign = 4;
-
    // Calculate the amount of stack space that we need to allocate to store
    // byval and variadic arguments that are passed in registers.
    // We need to know this before we allocate the first byval or variadic
    // argument, as they will be allocated a stack slot below the CFA (Canonical
    // Frame Address, the stack pointer at entry to the function).
+  unsigned ArgRegBegin = ARM::R4;
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
+      break;
+
      CCValAssign &VA = ArgLocs[i];
-    if (VA.isMemLoc()) {
-      int index = VA.getValNo();
-      if (index != lastInsIndex) {
-        ISD::ArgFlagsTy Flags = Ins[index].Flags;
-        if (Flags.isByVal()) {
-          unsigned ExtraArgRegsSize;
-          unsigned ExtraArgRegsSaveSize;
-          computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProceed(),
-                         Flags.getByValSize(),
-                         ExtraArgRegsSize, ExtraArgRegsSaveSize);
-
-          TotalArgRegsSaveSize += ExtraArgRegsSaveSize;
-          if (Flags.getByValAlign() > ArgRegsSaveSizeMaxAlign)
-              ArgRegsSaveSizeMaxAlign = Flags.getByValAlign();
-          CCInfo.nextInRegsParam();
-        }
-        lastInsIndex = index;
-      }
-    }
+    unsigned Index = VA.getValNo();
+    ISD::ArgFlagsTy Flags = Ins[Index].Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    assert(VA.isMemLoc() && "unexpected byval pointer in reg");
+    unsigned RBegin, REnd;
+    CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
+    ArgRegBegin = std::min(ArgRegBegin, RBegin);
+
+    CCInfo.nextInRegsParam();
    }
    CCInfo.rewindByValRegsInfo();
-  lastInsIndex = -1;
+
+  int lastInsIndex = -1;
    if (isVarArg && MFI->hasVAStart()) {
-    unsigned ExtraArgRegsSize;
-    unsigned ExtraArgRegsSaveSize;
-    computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsCount(), 0,
-                   ExtraArgRegsSize, ExtraArgRegsSaveSize);
-    TotalArgRegsSaveSize += ExtraArgRegsSaveSize;
+    unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
+    if (RegIdx != array_lengthof(GPRArgRegs))
+      ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
    }
-  // If the arg regs save area contains N-byte aligned values, the
-  // bottom of it must be at least N-byte aligned.
-  TotalArgRegsSaveSize = RoundUpToAlignment(TotalArgRegsSaveSize, ArgRegsSaveSizeMaxAlign);
-  TotalArgRegsSaveSize = std::min(TotalArgRegsSaveSize, 16U);
+
+  unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
+  AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
  
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
-    std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
+    if (Ins[VA.getValNo()].isOrigArg()) {
+      std::advance(CurOrigArg,
+                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
+    }
      // Arguments stored in registers.
      if (VA.isRegLoc()) {
        EVT RegVT = VA.getLocVT();
@@ -3116,9 +3040,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
            }
            ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
            ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
-                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
+                                 ArgValue, ArgValue1,
+                                 DAG.getIntPtrConstant(0, dl));
            ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
-                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
+                                 ArgValue, ArgValue2,
+                                 DAG.getIntPtrConstant(1, dl));
          } else
            ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
  
@@ -3132,9 +3058,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
          else if (RegVT == MVT::v2f64)
            RC = &ARM::QPRRegClass;
          else if (RegVT == MVT::i32)
-          RC = AFI->isThumb1OnlyFunction() ?
-            (const TargetRegisterClass*)&ARM::tGPRRegClass :
-            (const TargetRegisterClass*)&ARM::GPRRegClass;
+          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
+                                           : &ARM::GPRRegClass;
          else
            llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
  
@@ -3172,7 +3097,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
        assert(VA.isMemLoc());
        assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
  
-      int index = ArgLocs[i].getValNo();
+      int index = VA.getValNo();
  
        // Some Ins[] entries become multiple ArgLoc[] entries.
        // Process them only once.
@@ -3185,20 +3110,13 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
            // Since they could be overwritten by lowering of arguments in case of
            // a tail call.
            if (Flags.isByVal()) {
-            unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
-
-            ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign());
-            int FrameIndex = StoreByValRegs(
-                CCInfo, DAG, dl, Chain, CurOrigArg,
-                CurByValIndex,
-                Ins[VA.getValNo()].PartOffset,
-                VA.getLocMemOffset(),
-                Flags.getByValSize(),
-                true /*force mutable frames*/,
-                ByValStoreOffset,
-                TotalArgRegsSaveSize);
-            ByValStoreOffset += Flags.getByValSize();
-            ByValStoreOffset = std::min(ByValStoreOffset, 16U);
+            assert(Ins[index].isOrigArg() &&
+                   "Byval arguments cannot be implicit");
+            unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
+
+            int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg,
+                                            CurByValIndex, VA.getLocMemOffset(),
+                                            Flags.getByValSize());
              InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
              CCInfo.nextInRegsParam();
            } else {
@@ -3240,6 +3158,18 @@ static bool isFloatingPointZero(SDValue Op) {
          if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
            return CFP->getValueAPF().isPosZero();
      }
+  } else if (Op->getOpcode() == ISD::BITCAST &&
+             Op->getValueType(0) == MVT::f64) {
+    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
+    // created by LowerConstantFP().
+    SDValue BitcastOp = Op->getOperand(0);
+    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) {
+      SDValue MoveOp = BitcastOp->getOperand(0);
+      if (MoveOp->getOpcode() == ISD::TargetConstant &&
+          cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) {
+        return true;
+      }
+    }
    }
    return false;
  }
@@ -3260,28 +3190,28 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
        case ISD::SETGE:
          if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
            CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
-          RHS = DAG.getConstant(C-1, MVT::i32);
+          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
          }
          break;
        case ISD::SETULT:
        case ISD::SETUGE:
          if (C != 0 && isLegalICmpImmediate(C-1)) {
            CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
-          RHS = DAG.getConstant(C-1, MVT::i32);
+          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
          }
          break;
        case ISD::SETLE:
        case ISD::SETGT:
          if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
            CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
-          RHS = DAG.getConstant(C+1, MVT::i32);
+          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
          }
          break;
        case ISD::SETULE:
        case ISD::SETUGT:
          if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
            CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
-          RHS = DAG.getConstant(C+1, MVT::i32);
+          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
          }
          break;
        }
@@ -3300,7 +3230,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
      CompareType = ARMISD::CMPZ;
      break;
    }
-  ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
  }
  
@@ -3346,7 +3276,7 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
    SDValue Value, OverflowCmp;
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);
-
+  SDLoc dl(Op);
  
    // FIXME: We are currently always generating CMPs because we don't support
    // generating CMN through the backend. This is not as good as the natural
@@ -3357,24 +3287,24 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
    default:
      llvm_unreachable("Unknown overflow instruction!");
    case ISD::SADDO:
-    ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32);
-    Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS);
-    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS);
+    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
      break;
    case ISD::UADDO:
-    ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32);
-    Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS);
-    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS);
+    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
      break;
    case ISD::SSUBO:
-    ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32);
-    Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS);
-    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS);
+    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
      break;
    case ISD::USUBO:
-    ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32);
-    Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS);
-    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS);
+    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
      break;
    } // switch (...)
  
@@ -3392,16 +3322,17 @@ ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDLoc dl(Op);
    // We use 0 and 1 as false and true values.
-  SDValue TVal = DAG.getConstant(1, MVT::i32);
-  SDValue FVal = DAG.getConstant(0, MVT::i32);
+  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
    EVT VT = Op.getValueType();
  
-  SDValue Overflow = DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, TVal, FVal,
+  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
                                   ARMcc, CCR, OverflowCmp);
  
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
  }
  
  
@@ -3424,7 +3355,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      EVT VT = Op.getValueType();
  
-    return getCMOV(SDLoc(Op), VT, SelectTrue, SelectFalse, ARMcc, CCR,
+    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
                     OverflowCmp, DAG);
    }
  
@@ -3467,19 +3398,13 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
    // undefined bits before doing a full-word comparison with zero.
    Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
-                     DAG.getConstant(1, Cond.getValueType()));
+                     DAG.getConstant(1, dl, Cond.getValueType()));
  
    return DAG.getSelectCC(dl, Cond,
-                         DAG.getConstant(0, Cond.getValueType()),
+                         DAG.getConstant(0, dl, Cond.getValueType()),
                           SelectTrue, SelectFalse, ISD::SETNE);
  }
  
-static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
-  if (CC == ISD::SETNE)
-    return ISD::SETEQ;
-  return ISD::getSetCCInverse(CC, true);
-}
-
  static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                   bool &swpCmpOps, bool &swpVselOps) {
    // Start by selecting the GE condition code for opcodes that return true for
@@ -3571,7 +3496,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
      // If softenSetCCOperands only returned one value, we should compare it to
      // zero.
      if (!RHS.getNode()) {
-      RHS = DAG.getConstant(0, LHS.getValueType());
+      RHS = DAG.getConstant(0, dl, LHS.getValueType());
        CC = ISD::SETNE;
      }
    }
@@ -3587,12 +3512,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
      // inverting the compare condition, swapping 'less' and 'greater') and
      // sometimes need to swap the operands to the VSEL (which inverts the
      // condition in the sense of firing whenever the previous condition didn't)
-    if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
-                                      TrueVal.getValueType() == MVT::f64)) {
+    if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                    TrueVal.getValueType() == MVT::f64)) {
        ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
        if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
            CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
-        CC = getInverseCCForVSEL(CC);
+        CC = ISD::getSetCCInverse(CC, true);
          std::swap(TrueVal, FalseVal);
        }
      }
@@ -3606,21 +3531,114 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
    ARMCC::CondCodes CondCode, CondCode2;
    FPCCToARMCC(CC, CondCode, CondCode2);
  
-  // Try to generate VSEL on ARMv8.
-  if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
-                                    TrueVal.getValueType() == MVT::f64)) {
-    // We can select VMAXNM/VMINNM from a compare followed by a select with the
+  // Try to generate VMAXNM/VMINNM on ARMv8.
+  if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                  TrueVal.getValueType() == MVT::f64)) {
+    // We can use VMAXNM/VMINNM for a compare followed by a select with the
      // same operands, as follows:
-    //   c = fcmp [ogt, olt, ugt, ult] a, b
+    //   c = fcmp [?gt, ?ge, ?lt, ?le] a, b
      //   select c, a, b
-    // We only do this in unsafe-fp-math, because signed zeros and NaNs are
-    // handled differently than the original code sequence.
-    if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal &&
-        RHS == FalseVal) {
-      if (CC == ISD::SETOGT || CC == ISD::SETUGT)
-        return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
-      if (CC == ISD::SETOLT || CC == ISD::SETULT)
-        return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+    // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'.
+    bool swapSides = false;
+    if (!getTargetMachine().Options.NoNaNsFPMath) {
+      // transformability may depend on which way around we compare
+      switch (CC) {
+      default:
+        break;
+      case ISD::SETOGT:
+      case ISD::SETOGE:
+      case ISD::SETOLT:
+      case ISD::SETOLE:
+        // the non-NaN should be RHS
+        swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS);
+        break;
+      case ISD::SETUGT:
+      case ISD::SETUGE:
+      case ISD::SETULT:
+      case ISD::SETULE:
+        // the non-NaN should be LHS
+        swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS);
+        break;
+      }
+    }
+    swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal);
+    if (swapSides) {
+      CC = ISD::getSetCCSwappedOperands(CC);
+      std::swap(LHS, RHS);
+    }
+    if (LHS == TrueVal && RHS == FalseVal) {
+      bool canTransform = true;
+      // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here
+      if (!getTargetMachine().Options.UnsafeFPMath &&
+          !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+        const ConstantFPSDNode *Zero;
+        switch (CC) {
+        default:
+          break;
+        case ISD::SETOGT:
+        case ISD::SETUGT:
+        case ISD::SETGT:
+          // RHS must not be -0
+          canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) &&
+                         !Zero->isNegative();
+          break;
+        case ISD::SETOGE:
+        case ISD::SETUGE:
+        case ISD::SETGE:
+          // LHS must not be -0
+          canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) &&
+                         !Zero->isNegative();
+          break;
+        case ISD::SETOLT:
+        case ISD::SETULT:
+        case ISD::SETLT:
+          // RHS must not be +0
+          canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) &&
+                          Zero->isNegative();
+          break;
+        case ISD::SETOLE:
+        case ISD::SETULE:
+        case ISD::SETLE:
+          // LHS must not be +0
+          canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) &&
+                          Zero->isNegative();
+          break;
+        }
+      }
+      if (canTransform) {
+        // Note: If one of the elements in a pair is a number and the other
+        // element is NaN, the corresponding result element is the number.
+        // This is consistent with the IEEE 754-2008 standard.
+        // Therefore, a > b ? a : b <=> vmax(a,b), if b is constant and a is NaN
+        switch (CC) {
+        default:
+          break;
+        case ISD::SETOGT:
+        case ISD::SETOGE:
+          if (!DAG.isKnownNeverNaN(RHS))
+            break;
+          return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS);
+        case ISD::SETUGT:
+        case ISD::SETUGE:
+          if (!DAG.isKnownNeverNaN(LHS))
+            break;
+        case ISD::SETGT:
+        case ISD::SETGE:
+          return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS);
+        case ISD::SETOLT:
+        case ISD::SETOLE:
+          if (!DAG.isKnownNeverNaN(RHS))
+            break;
+          return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS);
+        case ISD::SETULT:
+        case ISD::SETULE:
+          if (!DAG.isKnownNeverNaN(LHS))
+            break;
+        case ISD::SETLT:
+        case ISD::SETLE:
+          return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS);
+        }
+      }
      }
  
      bool swpCmpOps = false;
@@ -3636,12 +3654,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
      }
    }
  
-  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
    if (CondCode2 != ARMCC::AL) {
-    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
+    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
      // FIXME: Needs another CMP because flag can have but one use.
      SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
      Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
@@ -3674,7 +3692,7 @@ static bool canChangeToInt(SDValue Op, bool &SeenZero,
  
  static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
    if (isFloatingPointZero(Op))
-    return DAG.getConstant(0, MVT::i32);
+    return DAG.getConstant(0, SDLoc(Op), MVT::i32);
  
    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
      return DAG.getLoad(MVT::i32, SDLoc(Op),
@@ -3687,15 +3705,17 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  
  static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                             SDValue &RetVal1, SDValue &RetVal2) {
+  SDLoc dl(Op);
+
    if (isFloatingPointZero(Op)) {
-    RetVal1 = DAG.getConstant(0, MVT::i32);
-    RetVal2 = DAG.getConstant(0, MVT::i32);
+    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
+    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
      return;
    }
  
    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
      SDValue Ptr = Ld->getBasePtr();
-    RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op),
+    RetVal1 = DAG.getLoad(MVT::i32, dl,
                            Ld->getChain(), Ptr,
                            Ld->getPointerInfo(),
                            Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3703,9 +3723,9 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
  
      EVT PtrType = Ptr.getValueType();
      unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
-    SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op),
-                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
-    RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op),
+    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
+                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
+    RetVal2 = DAG.getLoad(MVT::i32, dl,
                            Ld->getChain(), NewPtr,
                            Ld->getPointerInfo().getWithOffset(4),
                            Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3740,7 +3760,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
      else if (CC == ISD::SETUNE)
        CC = ISD::SETNE;
  
-    SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32);
+    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
      SDValue ARMcc;
      if (LHS.getValueType() == MVT::f32) {
        LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
@@ -3760,7 +3780,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
      LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
      RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
-    ARMcc = DAG.getConstant(CondCode, MVT::i32);
+    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
      SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
      return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
@@ -3784,7 +3804,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
      // If softenSetCCOperands only returned one value, we should compare it to
      // zero.
      if (!RHS.getNode()) {
-      RHS = DAG.getConstant(0, LHS.getValueType());
+      RHS = DAG.getConstant(0, dl, LHS.getValueType());
        CC = ISD::SETNE;
      }
    }
@@ -3810,14 +3830,14 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
    ARMCC::CondCodes CondCode, CondCode2;
    FPCCToARMCC(CC, CondCode, CondCode2);
  
-  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
    SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
    if (CondCode2 != ARMCC::AL) {
-    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
+    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
      SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
      Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
    }
@@ -3832,11 +3852,9 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  
    EVT PTy = getPointerTy();
    JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
-  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
-  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
    SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
-  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
-  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
+  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
+  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
    SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
    if (Subtarget->isThumb2()) {
      // Thumb2 uses a two-level jump. That is, it jumps into the jump table
@@ -3844,7 +3862,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
      // to translate it to TBB / TBH later.
      // FIXME: This might not work if the function is extremely large.
      return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
-                       Addr, Op.getOperand(2), JTI, UId);
+                       Addr, Op.getOperand(2), JTI);
    }
    if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
@@ -3852,13 +3870,13 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
                         false, false, false, 0);
      Chain = Addr.getValue(1);
      Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
-    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
    } else {
      Addr = DAG.getLoad(PTy, dl, Chain, Addr,
                         MachinePointerInfo::getJumpTable(),
                         false, false, false, 0);
      Chain = Addr.getValue(1);
-    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
    }
  }
  
@@ -3885,7 +3903,6 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
    if (VT.isVector())
      return LowerVectorFP_TO_INT(Op, DAG);
-
    if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
      RTLIB::Libcall LC;
      if (Op.getOpcode() == ISD::FP_TO_SINT)
@@ -3898,20 +3915,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
                         /*isSigned*/ false, SDLoc(Op)).first;
    }
  
-  SDLoc dl(Op);
-  unsigned Opc;
-
-  switch (Op.getOpcode()) {
-  default: llvm_unreachable("Invalid opcode!");
-  case ISD::FP_TO_SINT:
-    Opc = ARMISD::FTOSI;
-    break;
-  case ISD::FP_TO_UINT:
-    Opc = ARMISD::FTOUI;
-    break;
-  }
-  Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
-  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+  return Op;
  }
  
  static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -3951,7 +3955,6 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
    if (VT.isVector())
      return LowerVectorINT_TO_FP(Op, DAG);
-
    if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
      RTLIB::Libcall LC;
      if (Op.getOpcode() == ISD::SINT_TO_FP)
@@ -3964,21 +3967,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
                         /*isSigned*/ false, SDLoc(Op)).first;
    }
  
-  SDLoc dl(Op);
-  unsigned Opc;
-
-  switch (Op.getOpcode()) {
-  default: llvm_unreachable("Invalid opcode!");
-  case ISD::SINT_TO_FP:
-    Opc = ARMISD::SITOF;
-    break;
-  case ISD::UINT_TO_FP:
-    Opc = ARMISD::UITOF;
-    break;
-  }
-
-  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
-  return DAG.getNode(Opc, dl, VT, Op);
+  return Op;
  }
  
  SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
@@ -3996,12 +3985,12 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
      // Use VBSL to copy the sign bit.
      unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
      SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
-                               DAG.getTargetConstant(EncodedVal, MVT::i32));
+                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
      EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
      if (VT == MVT::f64)
        Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
-                         DAG.getConstant(32, MVT::i32));
+                         DAG.getConstant(32, dl, MVT::i32));
      else /*if (VT == MVT::f32)*/
        Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
      if (SrcVT == MVT::f32) {
@@ -4009,16 +3998,16 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
        if (VT == MVT::f64)
          Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                             DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
-                           DAG.getConstant(32, MVT::i32));
+                           DAG.getConstant(32, dl, MVT::i32));
      } else if (VT == MVT::f32)
        Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
                           DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
-                         DAG.getConstant(32, MVT::i32));
+                         DAG.getConstant(32, dl, MVT::i32));
      Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
      Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
  
      SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
-                                            MVT::i32);
+                                            dl, MVT::i32);
      AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
      SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                    DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
@@ -4029,7 +4018,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
      if (VT == MVT::f32) {
        Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
        Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
-                        DAG.getConstant(0, MVT::i32));
+                        DAG.getConstant(0, dl, MVT::i32));
      } else {
        Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
      }
@@ -4044,8 +4033,8 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
  
    // Or in the signbit with integer operations.
-  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
-  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
+  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
    if (VT == MVT::f32) {
      Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
@@ -4076,7 +4065,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
    unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    if (Depth) {
      SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    SDValue Offset = DAG.getConstant(4, MVT::i32);
+    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
      return DAG.getLoad(VT, dl, DAG.getEntryNode(),
                         DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
                         MachinePointerInfo(), false, false, false, 0);
@@ -4115,7 +4104,28 @@ unsigned ARMTargetLowering::getRegisterByName(const char* RegName,
                         .Default(0);
    if (Reg)
      return Reg;
-  report_fatal_error("Invalid register name global variable");
+  report_fatal_error(Twine("Invalid register name \""
+                              + StringRef(RegName)  + "\"."));
+}
+
+// Expand an i64 READ_REGISTER: re-read it as two i32 values, then push their
+// BUILD_PAIR followed by operand 0 of the new read (presumably the chain).
+static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // This function is only supposed to be called for i64 type destination.
+  assert(N->getValueType(0) == MVT::i64
+          && "ExpandREAD_REGISTER called for non-i64 type result.");
+
+  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
+                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
+                             N->getOperand(0),
+                             N->getOperand(1));
+
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
+                    Read.getValue(1)));
+  Results.push_back(Read.getOperand(0));
  }
  
  /// ExpandBITCAST - If the target supports VFP, this function is called to
@@ -4138,9 +4148,9 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
    // Turn i64->f64 into VMOVDRR.
    if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
      SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
-                             DAG.getConstant(0, MVT::i32));
+                             DAG.getConstant(0, dl, MVT::i32));
      SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
-                             DAG.getConstant(1, MVT::i32));
+                             DAG.getConstant(1, dl, MVT::i32));
      return DAG.getNode(ISD::BITCAST, dl, DstVT,
                         DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
    }
@@ -4172,7 +4182,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
  static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
    // The canonical modified immediate encoding of a zero vector is....0!
-  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
+  SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
    EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
    SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
    return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
@@ -4195,17 +4205,17 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
    assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
  
    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
-                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
-                                   DAG.getConstant(VTBits, MVT::i32));
+                                   DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
-                          ARMcc, DAG, dl);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+                          ISD::SETGE, ARMcc, DAG, dl);
    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
    SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
                             CCR, Cmp);
@@ -4229,17 +4239,17 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
  
    assert(Op.getOpcode() == ISD::SHL_PARTS);
    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
-                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
-                                   DAG.getConstant(VTBits, MVT::i32));
+                                   DAG.getConstant(VTBits, dl, MVT::i32));
    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
    SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  
    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
-                          ARMcc, DAG, dl);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+                          ISD::SETGE, ARMcc, DAG, dl);
    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
    SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
                             CCR, Cmp);
@@ -4256,14 +4266,14 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
    // so that the shift + and get folded into a bitfield extract.
    SDLoc dl(Op);
    SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
-                              DAG.getConstant(Intrinsic::arm_get_fpscr,
+                              DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
                                                MVT::i32));
    SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
-                                  DAG.getConstant(1U << 22, MVT::i32));
+                                  DAG.getConstant(1U << 22, dl, MVT::i32));
    SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
-                              DAG.getConstant(22, MVT::i32));
+                              DAG.getConstant(22, dl, MVT::i32));
    return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
-                     DAG.getConstant(3, MVT::i32));
+                     DAG.getConstant(3, dl, MVT::i32));
  }
  
  static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
@@ -4321,10 +4331,10 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
    if (VT.is64BitVector()) {
      SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
-                       DAG.getIntPtrConstant(0));
+                       DAG.getIntPtrConstant(0, DL));
    } else {
      SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
-                                    BitCounts, DAG.getIntPtrConstant(0));
+                                    BitCounts, DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
    }
  }
@@ -4363,10 +4373,10 @@ static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
    if (VT.is64BitVector()) {
      SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
-                       DAG.getIntPtrConstant(0));
+                       DAG.getIntPtrConstant(0, DL));
    } else {
      SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
-                                    DAG.getIntPtrConstant(0));
+                                    DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
    }
  }
@@ -4400,7 +4410,8 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
    // Left shifts translate directly to the vshiftu intrinsic.
    if (N->getOpcode() == ISD::SHL)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
+                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
+                                       MVT::i32),
                         N->getOperand(0), N->getOperand(1));
  
    assert((N->getOpcode() == ISD::SRA ||
@@ -4417,7 +4428,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                               Intrinsic::arm_neon_vshifts :
                               Intrinsic::arm_neon_vshiftu);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(vshiftInt, MVT::i32),
+                     DAG.getConstant(vshiftInt, dl, MVT::i32),
                       N->getOperand(0), NegatedCount);
  }
  
@@ -4443,9 +4454,9 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
  
    // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
-                           DAG.getConstant(0, MVT::i32));
+                           DAG.getConstant(0, dl, MVT::i32));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
-                           DAG.getConstant(1, MVT::i32));
+                           DAG.getConstant(1, dl, MVT::i32));
  
    // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
    // captures the result into a carry flag.
@@ -4468,6 +4479,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);
    SDValue CC = Op.getOperand(2);
+  EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
    EVT VT = Op.getValueType();
    ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
    SDLoc dl(Op);
@@ -4497,8 +4509,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
        TmpOp0 = Op0;
        TmpOp1 = Op1;
        Opc = ISD::OR;
-      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
-      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
        break;
      case ISD::SETUO: Invert = true; // Fallthrough
      case ISD::SETO:
@@ -4506,8 +4518,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
        TmpOp0 = Op0;
        TmpOp1 = Op1;
        Opc = ISD::OR;
-      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
-      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
        break;
      }
    } else {
@@ -4541,8 +4553,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  
        if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
          Opc = ARMISD::VTST;
-        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
-        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
+        Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
+        Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
          Invert = !Invert;
        }
      }
@@ -4568,22 +4580,24 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
    if (SingleOp.getNode()) {
      switch (Opc) {
      case ARMISD::VCEQ:
-      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
      case ARMISD::VCGE:
-      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
      case ARMISD::VCLEZ:
-      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
      case ARMISD::VCGT:
-      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
      case ARMISD::VCLTZ:
-      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
      default:
-      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
      }
    } else {
-     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+     Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
    }
  
+  Result = DAG.getSExtOrTrunc(Result, dl, VT);
+
    if (Invert)
      Result = DAG.getNOT(dl, Result, VT);
  
@@ -4595,7 +4609,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  /// operand (e.g., VMOV).  If so, return the encoded value.
  static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                   unsigned SplatBitSize, SelectionDAG &DAG,
-                                 EVT &VT, bool is128Bits, NEONModImmType type) {
+                                 SDLoc dl, EVT &VT, bool is128Bits,
+                                 NEONModImmType type) {
    unsigned OpCmode, Imm;
  
    // SplatBitSize is set to the smallest size that splats the vector, so a
@@ -4725,7 +4740,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
    }
  
    unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
-  return DAG.getTargetConstant(EncodedVal, MVT::i32);
+  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
  }
  
  SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
@@ -4755,11 +4770,11 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
      // It's a float and we are trying to use NEON operations where
      // possible. Lower it to a splat followed by an extract.
      SDLoc DL(Op);
-    SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
+    SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
      SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                        NewVal);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
-                       DAG.getConstant(0, MVT::i32));
+                       DAG.getConstant(0, DL, MVT::i32));
    }
  
    // The rest of our options are NEON only, make sure that's allowed before
@@ -4777,8 +4792,8 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
      return SDValue();
  
    // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
-  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
-                                     false, VMOVModImm);
+  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
+                                     VMovVT, false, VMOVModImm);
    if (NewVal != SDValue()) {
      SDLoc DL(Op);
      SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
@@ -4790,11 +4805,11 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
      SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                         VecConstant);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
-                       DAG.getConstant(0, MVT::i32));
+                       DAG.getConstant(0, DL, MVT::i32));
    }
  
    // Finally, try a VMVN.i32
-  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
                               false, VMVNModImm);
    if (NewVal != SDValue()) {
      SDLoc DL(Op);
@@ -4807,7 +4822,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
      SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                         VecConstant);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
-                       DAG.getConstant(0, MVT::i32));
+                       DAG.getConstant(0, DL, MVT::i32));
    }
  
    return SDValue();
@@ -5070,10 +5085,10 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
  
    if (ST->isThumb1Only()) {
      if (Val <= 255 || ~Val <= 255)
-      return DAG.getConstant(Val, MVT::i32);
+      return DAG.getConstant(Val, dl, MVT::i32);
    } else {
      if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
-      return DAG.getConstant(Val, MVT::i32);
+      return DAG.getConstant(Val, dl, MVT::i32);
    }
    return SDValue();
  }
@@ -5095,7 +5110,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
        EVT VmovVT;
        SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                        SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VmovVT, VT.is128BitVector(),
+                                      DAG, dl, VmovVT, VT.is128BitVector(),
                                        VMOVModImm);
        if (Val.getNode()) {
          SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
@@ -5106,7 +5121,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
        uint64_t NegatedImm = (~SplatBits).getZExtValue();
        Val = isNEONModifiedImm(NegatedImm,
                                        SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VmovVT, VT.is128BitVector(),
+                                      DAG, dl, VmovVT, VT.is128BitVector(),
                                        VMVNModImm);
        if (Val.getNode()) {
          SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
@@ -5117,7 +5132,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
        if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
          int ImmVal = ARM_AM::getFP32Imm(SplatBits);
          if (ImmVal != -1) {
-          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
+          SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
            return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
          }
        }
@@ -5199,8 +5214,8 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               VT.getVectorNumElements();
            N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                   DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
-                        Value, DAG.getConstant(index, MVT::i32)),
-                           DAG.getConstant(index, MVT::i32));
+                        Value, DAG.getConstant(index, dl, MVT::i32)),
+                           DAG.getConstant(index, dl, MVT::i32));
          } else
            N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          Value->getOperand(0), Value->getOperand(1));
@@ -5216,7 +5231,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
            SmallVector<SDValue, 3> Ops;
            Ops.push_back(N);
            Ops.push_back(Op.getOperand(I));
-          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
            N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
          }
        }
@@ -5280,7 +5295,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
        SDValue V = Op.getOperand(i);
        if (V.getOpcode() == ISD::UNDEF)
          continue;
-      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
      }
      return Vec;
@@ -5383,24 +5398,25 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
        VEXTOffsets[i] = NumElts;
        ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                     SourceVecs[i],
-                                   DAG.getIntPtrConstant(NumElts));
+                                   DAG.getIntPtrConstant(NumElts, dl));
      } else if (MaxElts[i] < NumElts) {
        // The extraction can just take the first half
        VEXTOffsets[i] = 0;
        ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                     SourceVecs[i],
-                                   DAG.getIntPtrConstant(0));
+                                   DAG.getIntPtrConstant(0, dl));
      } else {
        // An actual VEXT is needed
        VEXTOffsets[i] = MinElts[i];
        SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                       SourceVecs[i],
-                                     DAG.getIntPtrConstant(0));
+                                     DAG.getIntPtrConstant(0, dl));
        SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                       SourceVecs[i],
-                                     DAG.getIntPtrConstant(NumElts));
+                                     DAG.getIntPtrConstant(NumElts, dl));
        ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
-                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
+                                   DAG.getConstant(VEXTOffsets[i], dl,
+                                                   MVT::i32));
      }
    }
  
@@ -5534,13 +5550,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
    case OP_VDUP2:
    case OP_VDUP3:
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
-                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
+                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
    case OP_VEXT1:
    case OP_VEXT2:
    case OP_VEXT3:
      return DAG.getNode(ARMISD::VEXT, dl, VT,
                         OpLHS, OpRHS,
-                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
+                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
    case OP_VUZPL:
    case OP_VUZPR:
      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
@@ -5567,7 +5583,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
    SmallVector<SDValue, 8> VTBLMask;
    for (ArrayRef<int>::iterator
           I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
-    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
+    VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
  
    if (V2.getNode()->getOpcode() == ISD::UNDEF)
      return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
@@ -5591,7 +5607,7 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
    // into the bottom double word. The v8i16 case is similar.
    unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
    return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
-                     DAG.getConstant(ExtractNum, MVT::i32));
+                     DAG.getConstant(ExtractNum, DL, MVT::i32));
  }
  
  static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
@@ -5635,7 +5651,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
            return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
        }
        return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
-                         DAG.getConstant(Lane, MVT::i32));
+                         DAG.getConstant(Lane, dl, MVT::i32));
      }
  
      bool ReverseVEXT;
@@ -5644,7 +5660,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
        if (ReverseVEXT)
          std::swap(V1, V2);
        return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
-                         DAG.getConstant(Imm, MVT::i32));
+                         DAG.getConstant(Imm, dl, MVT::i32));
      }
  
      if (isVREVMask(ShuffleMask, VT, 64))
@@ -5657,7 +5673,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      if (V2->getOpcode() == ISD::UNDEF &&
          isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
        return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
-                         DAG.getConstant(Imm, MVT::i32));
+                         DAG.getConstant(Imm, dl, MVT::i32));
      }
  
      // Check for Neon shuffles that modify both input vectors in place.
@@ -5725,7 +5741,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
          Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                    ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                    DAG.getConstant(ShuffleMask[i] & (NumElts-1),
-                                                  MVT::i32)));
+                                                  dl, MVT::i32)));
      }
      SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
      return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -5780,11 +5796,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
    if (Op0.getOpcode() != ISD::UNDEF)
      Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                        DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
-                      DAG.getIntPtrConstant(0));
+                      DAG.getIntPtrConstant(0, dl));
    if (Op1.getOpcode() != ISD::UNDEF)
      Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                        DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
-                      DAG.getIntPtrConstant(1));
+                      DAG.getIntPtrConstant(1, dl));
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
  }
  
@@ -5956,14 +5972,15 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
    unsigned NumElts = VT.getVectorNumElements();
    MVT TruncVT = MVT::getIntegerVT(EltSize);
    SmallVector<SDValue, 8> Ops;
+  SDLoc dl(N);
    for (unsigned i = 0; i != NumElts; ++i) {
      ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
      const APInt &CInt = C->getAPIntValue();
      // Element types smaller than 32 bits are not legal, so use i32 elements.
      // The values are implicitly truncated so sext vs. zext doesn't matter.
-    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
    }
-  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+  return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       MVT::getVectorVT(TruncVT, NumElts), Ops);
  }
  
@@ -6076,14 +6093,15 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
    // Get reciprocal estimate.
    // float4 recip = vrecpeq_f32(yf);
    Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   Y);
    // Because char has a smaller range than uchar, we can actually get away
    // without any newton steps.  This requires that we use a weird bias
    // of 0xb000, however (again, this has been exhaustively tested).
    // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
    X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
    X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
-  Y = DAG.getConstant(0xb000, MVT::i32);
+  Y = DAG.getConstant(0xb000, dl, MVT::i32);
    Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
    X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
    X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
@@ -6108,9 +6126,10 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
    // float4 recip = vrecpeq_f32(yf);
    // recip *= vrecpsq_f32(yf, recip);
    N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   N1);
    N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                     N1, N2);
    N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
    // Because short has a smaller range than ushort, we can actually get away
@@ -6119,7 +6138,7 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
    // float4 result = as_float4(as_int4(xf*recip) + 0x89);
    N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
    N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
-  N1 = DAG.getConstant(0x89, MVT::i32);
+  N1 = DAG.getConstant(0x89, dl, MVT::i32);
    N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
    N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
    N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
@@ -6145,13 +6164,13 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
      N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
  
      N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
-                     DAG.getIntPtrConstant(4));
+                     DAG.getIntPtrConstant(4, dl));
      N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
-                     DAG.getIntPtrConstant(4));
+                     DAG.getIntPtrConstant(4, dl));
      N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
-                     DAG.getIntPtrConstant(0));
+                     DAG.getIntPtrConstant(0, dl));
      N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
-                     DAG.getIntPtrConstant(0));
+                     DAG.getIntPtrConstant(0, dl));
  
      N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
      N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
@@ -6180,13 +6199,13 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
  
      N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
-                     DAG.getIntPtrConstant(4));
+                     DAG.getIntPtrConstant(4, dl));
      N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
-                     DAG.getIntPtrConstant(4));
+                     DAG.getIntPtrConstant(4, dl));
      N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
-                     DAG.getIntPtrConstant(0));
+                     DAG.getIntPtrConstant(0, dl));
      N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
-                     DAG.getIntPtrConstant(0));
+                     DAG.getIntPtrConstant(0, dl));
  
      N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
      N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
@@ -6195,7 +6214,8 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
      N0 = LowerCONCAT_VECTORS(N0, DAG);
  
      N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
-                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
+                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
+                                     MVT::i32),
                       N0);
      return N0;
    }
@@ -6213,13 +6233,14 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
    // recip *= vrecpsq_f32(yf, recip);
    // recip *= vrecpsq_f32(yf, recip);
    N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   BN1);
    N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                     BN1, N2);
    N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
    N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                     BN1, N2);
    N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
    // Simply multiplying by the reciprocal estimate can leave us a few ulps
@@ -6228,7 +6249,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
    // float4 result = as_float4(as_int4(xf*recip) + 2);
    N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
    N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
-  N1 = DAG.getConstant(2, MVT::i32);
+  N1 = DAG.getConstant(2, dl, MVT::i32);
    N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
    N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
    N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
@@ -6274,7 +6295,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
    // Pair of floats / doubles used to pass the result.
-  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
  
    // Create stack object for sret.
    const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy);
@@ -6315,7 +6336,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
  
    // Address of cos field.
    SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet,
-                            DAG.getIntPtrConstant(ArgVT.getStoreSize()));
+                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
    SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
                                  MachinePointerInfo(), false, false, false, 0);
  
@@ -6345,12 +6366,12 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
      // Under Power Management extensions, the cycle-count is:
      //    mrc p15, #0, <Rt>, c9, c13, #0
      SDValue Ops[] = { N->getOperand(0), // Chain
-                      DAG.getConstant(Intrinsic::arm_mrc, MVT::i32),
-                      DAG.getConstant(15, MVT::i32),
-                      DAG.getConstant(0, MVT::i32),
-                      DAG.getConstant(9, MVT::i32),
-                      DAG.getConstant(13, MVT::i32),
-                      DAG.getConstant(0, MVT::i32)
+                      DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+                      DAG.getConstant(15, DL, MVT::i32),
+                      DAG.getConstant(0, DL, MVT::i32),
+                      DAG.getConstant(9, DL, MVT::i32),
+                      DAG.getConstant(13, DL, MVT::i32),
+                      DAG.getConstant(0, DL, MVT::i32)
      };
  
      Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
@@ -6360,13 +6381,13 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
      // Intrinsic is defined to return 0 on unsupported platforms. Technically
      // there are older ARM CPUs that have implementation-specific ways of
      // obtaining this information (FIXME!).
-    Cycles32 = DAG.getConstant(0, MVT::i32);
+    Cycles32 = DAG.getConstant(0, DL, MVT::i32);
      OutChain = DAG.getEntryNode();
    }
  
  
    SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
-                                 Cycles32, DAG.getConstant(0, MVT::i32));
+                                 Cycles32, DAG.getConstant(0, DL, MVT::i32));
    Results.push_back(Cycles64);
    Results.push_back(OutChain);
  }
@@ -6374,6 +6395,7 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
  SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
    case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
    case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
    case ISD::GlobalAddress:
@@ -6458,6 +6480,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
    switch (N->getOpcode()) {
    default:
      llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::READ_REGISTER:
+    ExpandREAD_REGISTER(N, Results, DAG);
+    break;
    case ISD::BITCAST:
      Res = ExpandBITCAST(N, DAG);
      break;
@@ -6482,8 +6507,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
  void ARMTargetLowering::
  SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
                         MachineBasicBlock *DispatchBB, int FI) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    DebugLoc dl = MI->getDebugLoc();
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -6500,9 +6524,8 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
      ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
    unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
  
-  const TargetRegisterClass *TRC = isThumb ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
+                                           : &ARM::GPRRegClass;
  
    // Grab constant pool and fixed stack memory operands.
    MachineMemOperand *CPMMO =
@@ -6566,9 +6589,9 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
                     .addReg(NewVReg2, RegState::Kill)
                     .addReg(NewVReg3, RegState::Kill));
      unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
-                   .addFrameIndex(FI)
-                   .addImm(36)); // &jbuf[1] :: pc
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
+            .addFrameIndex(FI)
+            .addImm(36); // &jbuf[1] :: pc
      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
                     .addReg(NewVReg4, RegState::Kill)
                     .addReg(NewVReg5, RegState::Kill)
@@ -6596,20 +6619,17 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
    }
  }
  
-MachineBasicBlock *ARMTargetLowering::
-EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
+                                              MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    DebugLoc dl = MI->getDebugLoc();
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo *MRI = &MF->getRegInfo();
-  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
    MachineFrameInfo *MFI = MF->getFrameInfo();
    int FI = MFI->getFunctionContextIndex();
  
-  const TargetRegisterClass *TRC = Subtarget->isThumb() ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
+  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
+                                                        : &ARM::GPRnopcRegClass;
  
    // Get a mapping of the call site numbers to all of the landing pads they're
    // associated with.
@@ -6660,7 +6680,6 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
    MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
    unsigned MJTI = JTI->createJumpTableIndex(LPadList);
-  unsigned UId = AFI->createJumpTableUId();
    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  
    // Create the MBBs for the dispatch code.
@@ -6743,8 +6762,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
  
      unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
-                   .addJumpTableIndex(MJTI)
-                   .addImm(UId));
+                   .addJumpTableIndex(MJTI));
  
      unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
      AddDefaultCC(
@@ -6757,8 +6775,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
      BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
        .addReg(NewVReg4, RegState::Kill)
        .addReg(NewVReg1)
-      .addJumpTableIndex(MJTI)
-      .addImm(UId);
+      .addJumpTableIndex(MJTI);
    } else if (Subtarget->isThumb()) {
      unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
@@ -6803,8 +6820,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
  
      unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
-                   .addJumpTableIndex(MJTI)
-                   .addImm(UId));
+                   .addJumpTableIndex(MJTI));
  
      unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
@@ -6833,8 +6849,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
  
      BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
        .addReg(NewVReg6, RegState::Kill)
-      .addJumpTableIndex(MJTI)
-      .addImm(UId);
+      .addJumpTableIndex(MJTI);
    } else {
      unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
@@ -6895,8 +6910,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
                       .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
      unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
-                   .addJumpTableIndex(MJTI)
-                   .addImm(UId));
+                   .addJumpTableIndex(MJTI));
  
      MachineMemOperand *JTMMOLd =
        MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
@@ -6913,13 +6927,11 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
        BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
          .addReg(NewVReg5, RegState::Kill)
          .addReg(NewVReg4)
-        .addJumpTableIndex(MJTI)
-        .addImm(UId);
+        .addJumpTableIndex(MJTI);
      } else {
        BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
          .addReg(NewVReg5, RegState::Kill)
-        .addJumpTableIndex(MJTI)
-        .addImm(UId);
+        .addJumpTableIndex(MJTI);
      }
    }
  
@@ -6928,7 +6940,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
    for (std::vector<MachineBasicBlock*>::iterator
           I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
      MachineBasicBlock *CurMBB = *I;
-    if (SeenMBBs.insert(CurMBB))
+    if (SeenMBBs.insert(CurMBB).second)
        DispContBB->addSuccessor(CurMBB);
    }
  
@@ -6995,8 +7007,6 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
  
    // The instruction is gone now.
    MI->eraseFromParent();
-
-  return MBB;
  }
  
  static
@@ -7114,8 +7124,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
    // This pseudo instruction has 3 operands: dst, src, size
    // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
    // Otherwise, we will generate unrolled scalar copies.
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator It = BB;
    ++It;
@@ -7141,9 +7150,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
      UnitSize = 2;
    } else {
      // Check whether we can use NEON instructions.
-    if (!MF->getFunction()->getAttributes().
-          hasAttribute(AttributeSet::FunctionIndex,
-                       Attribute::NoImplicitFloat) &&
+    if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
          Subtarget->hasNEON()) {
        if ((Align % 16 == 0) && SizeVal >= 16)
          UnitSize = 16;
@@ -7157,14 +7164,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
  
    // Select the correct opcode and register class for unit size load/store
    bool IsNeon = UnitSize >= 8;
-  TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass
-                               : (const TargetRegisterClass *)&ARM::GPRRegClass;
+  TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
    if (IsNeon)
-    VecTRC = UnitSize == 16
-                 ? (const TargetRegisterClass *)&ARM::DPairRegClass
-                 : UnitSize == 8
-                       ? (const TargetRegisterClass *)&ARM::DPRRegClass
-                       : nullptr;
+    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
+                            : UnitSize == 8 ? &ARM::DPRRegClass
+                                            : nullptr;
  
    unsigned BytesLeft = SizeVal % UnitSize;
    unsigned LoopSize = SizeVal - BytesLeft;
@@ -7237,16 +7241,20 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
  
    // Load an immediate to varEnd.
    unsigned varEnd = MRI.createVirtualRegister(TRC);
-  if (IsThumb2) {
+  if (Subtarget->useMovt(*MF)) {
      unsigned Vtmp = varEnd;
      if ((LoopSize & 0xFFFF0000) != 0)
        Vtmp = MRI.createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
-                       .addImm(LoopSize & 0xFFFF));
+    AddDefaultPred(BuildMI(BB, dl,
+                           TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16),
+                           Vtmp).addImm(LoopSize & 0xFFFF));
  
      if ((LoopSize & 0xFFFF0000) != 0)
-      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
-                         .addReg(Vtmp).addImm(LoopSize >> 16));
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16),
+                             varEnd)
+                         .addReg(Vtmp)
+                         .addImm(LoopSize >> 16));
    } else {
      MachineConstantPool *ConstantPool = MF->getConstantPool();
      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -7349,7 +7357,7 @@ MachineBasicBlock *
  ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
                                         MachineBasicBlock *MBB) const {
    const TargetMachine &TM = getTargetMachine();
-  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
    assert(Subtarget->isTargetWindows() &&
@@ -7414,8 +7422,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
  MachineBasicBlock *
  ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII =
-      getTargetMachine().getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    DebugLoc dl = MI->getDebugLoc();
    bool isThumb2 = Subtarget->isThumb2();
    switch (MI->getOpcode()) {
@@ -7608,13 +7615,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  
      unsigned int ABSSrcReg = MI->getOperand(1).getReg();
      unsigned int ABSDstReg = MI->getOperand(0).getReg();
+    bool ABSSrcKIll = MI->getOperand(1).isKill();
      bool isThumb2 = Subtarget->isThumb2();
      MachineRegisterInfo &MRI = Fn->getRegInfo();
      // In Thumb mode S must not be specified if source register is the SP or
      // PC and if destination register is the SP, so restrict register class
-    unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
-      (const TargetRegisterClass*)&ARM::rGPRRegClass :
-      (const TargetRegisterClass*)&ARM::GPRRegClass);
+    unsigned NewRsbDstReg =
+      MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
  
      // Transfer the remainder of BB and its successor edges to sinkMBB.
      SinkBB->splice(SinkBB->begin(), BB,
@@ -7642,7 +7649,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      // by if-conversion pass
      BuildMI(*RSBBB, RSBBB->begin(), dl,
        TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
-      .addReg(ABSSrcReg, RegState::Kill)
+      .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
        .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
  
      // insert PHI in SinkBB,
@@ -7668,12 +7675,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  
  void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                        SDNode *Node) const {
-  if (!MI->hasPostISelHook()) {
-    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
-           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
-    return;
-  }
-
    const MCInstrDesc *MCID = &MI->getDesc();
    // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
    // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
@@ -7685,8 +7686,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
    // Rename pseudo opcodes.
    unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
    if (NewOpc) {
-    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
-        getTargetMachine().getSubtargetImpl()->getInstrInfo());
+    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
      MCID = &TII->get(NewOpc);
  
      assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
@@ -7790,6 +7790,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
        return false;
      // Fall through.
    case ISD::SIGN_EXTEND: {
+    SDLoc dl(N);
      EVT VT = N->getValueType(0);
      CC = N->getOperand(0);
      if (CC.getValueType() != MVT::i1)
@@ -7798,12 +7799,13 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
      if (AllOnes)
        // When looking for an AllOnes constant, N is an sext, and the 'other'
        // value is 0.
-      OtherOp = DAG.getConstant(0, VT);
+      OtherOp = DAG.getConstant(0, dl, VT);
      else if (N->getOpcode() == ISD::ZERO_EXTEND)
        // When looking for a 0 constant, N can be zext or sext.
-      OtherOp = DAG.getConstant(1, VT);
+      OtherOp = DAG.getConstant(1, dl, VT);
      else
-      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
+                                VT);
      return true;
    }
    }
@@ -7942,9 +7944,11 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
    SelectionDAG &DAG = DCI.DAG;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
+  SDLoc dl(N);
+
    // Build operand list.
    SmallVector<SDValue, 8> Ops;
-  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                  TLI.getPointerTy()));
  
    // Input is the vector.
@@ -7963,9 +7967,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
        llvm_unreachable("Invalid vector element type for padd optimization.");
    }
  
-  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), widenType, Ops);
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
    unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
-  return DAG.getNode(ExtOp, SDLoc(N), VT, tmp);
+  return DAG.getNode(ExtOp, dl, VT, tmp);
  }
  
  static SDValue findMUL_LOHI(SDValue V) {
@@ -7990,13 +7994,13 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
    // a glue link from the first add to the second add.
    // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
    // a S/UMLAL instruction.
-  //          loAdd   UMUL_LOHI
-  //            \    / :lo    \ :hi
-  //             \  /          \          [no multiline comment]
-  //              ADDC         |  hiAdd
-  //                 \ :glue  /  /
-  //                  \      /  /
-  //                    ADDE
+  //                  UMUL_LOHI
+  //                 / :lo    \ :hi
+  //                /          \          [no multiline comment]
+  //    loAdd ->  ADDE         |
+  //                 \ :glue  /
+  //                  \      /
+  //                    ADDC   <- hiAdd
    //
    assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
    SDValue AddcOp0 = AddcNode->getOperand(0);
@@ -8050,29 +8054,35 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
    else
      IsLeftOperandMUL = true;
    if (MULOp == SDValue())
-     return SDValue();
+    return SDValue();
  
    // Figure out the right opcode.
    unsigned Opc = MULOp->getOpcode();
    unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
  
    // Figure out the high and low input values to the MLAL node.
-  SDValue* HiMul = &MULOp;
    SDValue* HiAdd = nullptr;
    SDValue* LoMul = nullptr;
    SDValue* LowAdd = nullptr;
  
+  // Ensure that ADDE is from high result of ISD::SMUL_LOHI.
+  if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+    return SDValue();
+
    if (IsLeftOperandMUL)
      HiAdd = &AddeOp1;
    else
      HiAdd = &AddeOp0;
  
  
-  if (AddcOp0->getOpcode() == Opc) {
+  // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
+  // whose low result is fed to the ADDC we are checking.
+
+  if (AddcOp0 == MULOp.getValue(0)) {
      LoMul = &AddcOp0;
      LowAdd = &AddcOp1;
    }
-  if (AddcOp1->getOpcode() == Opc) {
+  if (AddcOp1 == MULOp.getValue(0)) {
      LoMul = &AddcOp1;
      LowAdd = &AddcOp0;
    }
@@ -8080,9 +8090,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
    if (!LoMul)
      return SDValue();
  
-  if (LoMul->getNode() != HiMul->getNode())
-    return SDValue();
-
    // Create the merged node.
    SelectionDAG &DAG = DCI.DAG;
  
@@ -8256,14 +8263,14 @@ static SDValue PerformMULCombine(SDNode *N,
                          V,
                          DAG.getNode(ISD::SHL, DL, VT,
                                      V,
-                                    DAG.getConstant(Log2_32(MulAmt - 1),
+                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                      MVT::i32)));
      } else if (isPowerOf2_32(MulAmt + 1)) {
        // (mul x, 2^N - 1) => (sub (shl x, N), x)
        Res = DAG.getNode(ISD::SUB, DL, VT,
                          DAG.getNode(ISD::SHL, DL, VT,
                                      V,
-                                    DAG.getConstant(Log2_32(MulAmt + 1),
+                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                      MVT::i32)),
                          V);
      } else
@@ -8276,7 +8283,7 @@ static SDValue PerformMULCombine(SDNode *N,
                          V,
                          DAG.getNode(ISD::SHL, DL, VT,
                                      V,
-                                    DAG.getConstant(Log2_32(MulAmtAbs + 1),
+                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                      MVT::i32)));
      } else if (isPowerOf2_32(MulAmtAbs - 1)) {
        // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
@@ -8284,10 +8291,10 @@ static SDValue PerformMULCombine(SDNode *N,
                          V,
                          DAG.getNode(ISD::SHL, DL, VT,
                                      V,
-                                    DAG.getConstant(Log2_32(MulAmtAbs-1),
+                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                      MVT::i32)));
        Res = DAG.getNode(ISD::SUB, DL, VT,
-                        DAG.getConstant(0, MVT::i32),Res);
+                        DAG.getConstant(0, DL, MVT::i32), Res);
  
      } else
        return SDValue();
@@ -8295,7 +8302,7 @@ static SDValue PerformMULCombine(SDNode *N,
  
    if (ShiftAmt != 0)
      Res = DAG.getNode(ISD::SHL, DL, VT,
-                      Res, DAG.getConstant(ShiftAmt, MVT::i32));
+                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
  
    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, Res, false);
@@ -8324,7 +8331,7 @@ static SDValue PerformANDCombine(SDNode *N,
        EVT VbicVT;
        SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
                                        SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VbicVT, VT.is128BitVector(),
+                                      DAG, dl, VbicVT, VT.is128BitVector(),
                                        OtherModImm);
        if (Val.getNode()) {
          SDValue Input =
@@ -8367,7 +8374,7 @@ static SDValue PerformORCombine(SDNode *N,
        EVT VorrVT;
        SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                        SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VorrVT, VT.is128BitVector(),
+                                      DAG, dl, VorrVT, VT.is128BitVector(),
                                        OtherModImm);
        if (Val.getNode()) {
          SDValue Input =
@@ -8471,8 +8478,8 @@ static SDValue PerformORCombine(SDNode *N,
        Val >>= countTrailingZeros(~Mask);
  
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
-                        DAG.getConstant(Val, MVT::i32),
-                        DAG.getConstant(Mask, MVT::i32));
+                        DAG.getConstant(Val, DL, MVT::i32),
+                        DAG.getConstant(Mask, DL, MVT::i32));
  
        // Do not add new nodes to DAG combiner worklist.
        DCI.CombineTo(N, Res, false);
@@ -8497,9 +8504,9 @@ static SDValue PerformORCombine(SDNode *N,
        // 2a
        unsigned amt = countTrailingZeros(Mask2);
        Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
-                        DAG.getConstant(amt, MVT::i32));
+                        DAG.getConstant(amt, DL, MVT::i32));
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
-                        DAG.getConstant(Mask, MVT::i32));
+                        DAG.getConstant(Mask, DL, MVT::i32));
        // Do not add new nodes to DAG combiner worklist.
        DCI.CombineTo(N, Res, false);
        return SDValue();
@@ -8513,9 +8520,9 @@ static SDValue PerformORCombine(SDNode *N,
        // 2b
        unsigned lsb = countTrailingZeros(Mask);
        Res = DAG.getNode(ISD::SRL, DL, VT, N00,
-                        DAG.getConstant(lsb, MVT::i32));
+                        DAG.getConstant(lsb, DL, MVT::i32));
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
-                        DAG.getConstant(Mask2, MVT::i32));
+                        DAG.getConstant(Mask2, DL, MVT::i32));
        // Do not add new nodes to DAG combiner worklist.
        DCI.CombineTo(N, Res, false);
        return SDValue();
@@ -8534,7 +8541,7 @@ static SDValue PerformORCombine(SDNode *N,
        return SDValue();
  
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
-                      DAG.getConstant(~Mask, MVT::i32));
+                      DAG.getConstant(~Mask, DL, MVT::i32));
  
      // Do not add new nodes to DAG combiner worklist.
      DCI.CombineTo(N, Res, false);
@@ -8574,7 +8581,10 @@ static SDValue PerformBFICombine(SDNode *N,
      unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
      unsigned LSB = countTrailingZeros(~InvMask);
      unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
-    unsigned Mask = (1 << Width)-1;
+    assert(Width <
+               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+           "undefined behavior");
+    unsigned Mask = (1u << Width) - 1;
      unsigned Mask2 = N11C->getZExtValue();
      if ((Mask & (~Mask2)) == 0)
        return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
@@ -8612,7 +8622,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
                                   LD->getAlignment());
  
      SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
-                                    DAG.getConstant(4, MVT::i32));
+                                    DAG.getConstant(4, DL, MVT::i32));
      SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
                                   LD->getPointerInfo(), LD->isVolatile(),
                                   LD->isNonTemporal(), LD->isInvariant(),
@@ -8646,147 +8656,6 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
    return SDValue();
  }
  
-/// PerformSTORECombine - Target-specific dag combine xforms for
-/// ISD::STORE.
-static SDValue PerformSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
-  StoreSDNode *St = cast<StoreSDNode>(N);
-  if (St->isVolatile())
-    return SDValue();
-
-  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
-  // pack all of the elements in one place.  Next, store to memory in fewer
-  // chunks.
-  SDValue StVal = St->getValue();
-  EVT VT = StVal.getValueType();
-  if (St->isTruncatingStore() && VT.isVector()) {
-    SelectionDAG &DAG = DCI.DAG;
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    EVT StVT = St->getMemoryVT();
-    unsigned NumElems = VT.getVectorNumElements();
-    assert(StVT != VT && "Cannot truncate to the same type");
-    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
-    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
-
-    // From, To sizes and ElemCount must be pow of two
-    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
-
-    // We are going to use the original vector elt for storing.
-    // Accumulated smaller vector elements must be a multiple of the store size.
-    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
-
-    unsigned SizeRatio  = FromEltSz / ToEltSz;
-    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
-
-    // Create a type on which we perform the shuffle.
-    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
-                                     NumElems*SizeRatio);
-    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
-    SDLoc DL(St);
-    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
-    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i < NumElems; ++i)
-      ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio;
-
-    // Can't shuffle using an illegal type.
-    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
-
-    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
-                                DAG.getUNDEF(WideVec.getValueType()),
-                                ShuffleVec.data());
-    // At this point all of the data is stored at the bottom of the
-    // register. We now need to save it to mem.
-
-    // Find the largest store unit
-    MVT StoreType = MVT::i8;
-    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
-         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
-      MVT Tp = (MVT::SimpleValueType)tp;
-      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
-        StoreType = Tp;
-    }
-    // Didn't find a legal store type.
-    if (!TLI.isTypeLegal(StoreType))
-      return SDValue();
-
-    // Bitcast the original vector into a vector of store-size units
-    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
-            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
-    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
-    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
-    SmallVector<SDValue, 8> Chains;
-    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
-                                        TLI.getPointerTy());
-    SDValue BasePtr = St->getBasePtr();
-
-    // Perform one or more big stores into memory.
-    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
-    for (unsigned I = 0; I < E; I++) {
-      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                                   StoreType, ShuffWide,
-                                   DAG.getIntPtrConstant(I));
-      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
-                                St->getPointerInfo(), St->isVolatile(),
-                                St->isNonTemporal(), St->getAlignment());
-      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
-                            Increment);
-      Chains.push_back(Ch);
-    }
-    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
-  }
-
-  if (!ISD::isNormalStore(St))
-    return SDValue();
-
-  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
-  // ARM stores of arguments in the same cache line.
-  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
-      StVal.getNode()->hasOneUse()) {
-    SelectionDAG  &DAG = DCI.DAG;
-    bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
-    SDLoc DL(St);
-    SDValue BasePtr = St->getBasePtr();
-    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
-                                  StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
-                                  BasePtr, St->getPointerInfo(), St->isVolatile(),
-                                  St->isNonTemporal(), St->getAlignment());
-
-    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
-                                    DAG.getConstant(4, MVT::i32));
-    return DAG.getStore(NewST1.getValue(0), DL,
-                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
-                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
-                        St->isNonTemporal(),
-                        std::min(4U, St->getAlignment() / 2));
-  }
-
-  if (StVal.getValueType() != MVT::i64 ||
-      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
-    return SDValue();
-
-  // Bitcast an i64 store extracted from a vector to f64.
-  // Otherwise, the i64 value will be legalized to a pair of i32 values.
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc dl(StVal);
-  SDValue IntVec = StVal.getOperand(0);
-  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
-                                 IntVec.getValueType().getVectorNumElements());
-  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
-  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                               Vec, StVal.getOperand(1));
-  dl = SDLoc(N);
-  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
-  // Make the DAGCombiner fold the bitcasts.
-  DCI.AddToWorklist(Vec.getNode());
-  DCI.AddToWorklist(ExtElt.getNode());
-  DCI.AddToWorklist(V.getNode());
-  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
-                      St->getPointerInfo(), St->isVolatile(),
-                      St->isNonTemporal(), St->getAlignment(),
-                      St->getAAInfo());
-}
-
  /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
  /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
  /// i64 vector to have f64 elements, since the value can then be loaded
@@ -8919,7 +8788,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
        // Make the DAGCombiner fold the bitcasts.
        DCI.AddToWorklist(V.getNode());
      }
-    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
    }
    Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
@@ -9007,18 +8876,21 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
                                DAG.getUNDEF(VT), NewMask.data());
  }
  
-/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
-/// NEON load/store intrinsics to merge base address updates.
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
+/// NEON load/store intrinsics, and generic vector load/stores, to merge
+/// base address updates.
+/// For generic load/stores, the memory type is assumed to be a vector.
+/// The caller is assumed to have checked legality.
  static SDValue CombineBaseUpdate(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
-  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
-    return SDValue();
-
    SelectionDAG &DAG = DCI.DAG;
-  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
-                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
-  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  const bool isStore = N->getOpcode() == ISD::STORE;
+  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
    SDValue Addr = N->getOperand(AddrOpIdx);
+  MemSDNode *MemN = cast<MemSDNode>(N);
+  SDLoc dl(N);
  
    // Search for a use of the address operand that is an increment.
    for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
@@ -9034,7 +8906,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
        continue;
  
      // Find the new opcode for the updating load/store.
-    bool isLoad = true;
+    bool isLoadOp = true;
      bool isLaneOp = false;
      unsigned NewOpc = 0;
      unsigned NumVecs = 0;
@@ -9057,19 +8929,19 @@ static SDValue CombineBaseUpdate(SDNode *N,
        case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
          NumVecs = 4; isLaneOp = true; break;
        case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
-        NumVecs = 1; isLoad = false; break;
+        NumVecs = 1; isLoadOp = false; break;
        case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
-        NumVecs = 2; isLoad = false; break;
+        NumVecs = 2; isLoadOp = false; break;
        case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
-        NumVecs = 3; isLoad = false; break;
+        NumVecs = 3; isLoadOp = false; break;
        case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
-        NumVecs = 4; isLoad = false; break;
+        NumVecs = 4; isLoadOp = false; break;
        case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
-        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
        case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
-        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
        case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
-        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
        }
      } else {
        isLaneOp = true;
@@ -9078,15 +8950,24 @@ static SDValue CombineBaseUpdate(SDNode *N,
        case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
        case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
        case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; isLaneOp = false; break;
+      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
        }
      }
  
      // Find the size of memory referenced by the load/store.
      EVT VecTy;
-    if (isLoad)
+    if (isLoadOp) {
        VecTy = N->getValueType(0);
-    else
+    } else if (isIntrinsic) {
        VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    } else {
+      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
+      VecTy = N->getOperand(1).getValueType();
+    }
+
      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
      if (isLaneOp)
        NumBytes /= VecTy.getVectorNumElements();
@@ -9103,32 +8984,99 @@ static SDValue CombineBaseUpdate(SDNode *N,
        continue;
      }
  
+    // OK, we found an ADD we can fold into the base update.
+    // Now, create a _UPD node, taking care of not breaking alignment.
+
+    EVT AlignedVecTy = VecTy;
+    unsigned Alignment = MemN->getAlignment();
+
+    // If this is a less-than-standard-aligned load/store, change the type to
+    // match the standard alignment.
+    // The alignment is overlooked when selecting _UPD variants; and it's
+    // easier to introduce bitcasts here than fix that.
+    // There are 3 ways to get to this base-update combine:
+    // - intrinsics: they are assumed to be properly aligned (to the standard
+    //   alignment of the memory type), so we don't need to do anything.
+    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+    //   intrinsics, so, likewise, there's nothing to do.
+    // - generic load/store instructions: the alignment is specified as an
+    //   explicit operand, rather than implicitly as the standard alignment
+    //   of the memory type (like the intrinsics).  We need to change the
+    //   memory type to match the explicit alignment.  That way, we don't
+    //   generate non-standard-aligned ARMISD::VLDx nodes.
+    if (isa<LSBaseSDNode>(N)) {
+      if (Alignment == 0)
+        Alignment = 1;
+      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+        assert(!isLaneOp && "Unexpected generic load/store lane.");
+        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+      }
+      // Don't set an explicit alignment on regular load/stores that we want
+      // to transform to VLD/VST 1_UPD nodes.
+      // This matches the behavior of regular load/stores, which only get an
+      // explicit alignment if the MMO alignment is larger than the standard
+      // alignment of the memory type.
+      // Intrinsics, however, always get an explicit alignment, set to the
+      // alignment of the MMO.
+      Alignment = 1;
+    }
+
      // Create the new updating load/store node.
+    // First, create an SDVTList for the new updating node's results.
      EVT Tys[6];
-    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
      unsigned n;
      for (n = 0; n < NumResultVecs; ++n)
-      Tys[n] = VecTy;
+      Tys[n] = AlignedVecTy;
      Tys[n++] = MVT::i32;
      Tys[n] = MVT::Other;
      SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
+
+    // Then, gather the new node's operands.
      SmallVector<SDValue, 8> Ops;
      Ops.push_back(N->getOperand(0)); // incoming chain
      Ops.push_back(N->getOperand(AddrOpIdx));
      Ops.push_back(Inc);
-    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
-      Ops.push_back(N->getOperand(i));
+
+    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+      // Try to match the intrinsic's signature
+      Ops.push_back(StN->getValue());
+    } else {
+      // Loads (and of course intrinsics) match the intrinsics' signature,
+      // so just add all but the alignment operand.
+      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+        Ops.push_back(N->getOperand(i));
+    }
+
+    // For all node types, the alignment operand is always the last one.
+    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+
+    // If this is a non-standard-aligned STORE, the penultimate operand is the
+    // stored value.  Bitcast it to the aligned type.
+    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+      SDValue &StVal = Ops[Ops.size()-2];
+      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
      }
-    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
-                                           Ops, MemInt->getMemoryVT(),
-                                           MemInt->getMemOperand());
+
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
+                                           Ops, AlignedVecTy,
+                                           MemN->getMemOperand());
  
      // Update the uses.
-    std::vector<SDValue> NewResults;
-    for (unsigned i = 0; i < NumResultVecs; ++i) {
+    SmallVector<SDValue, 5> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i)
        NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+    // If this is a non-standard-aligned LOAD, the first result is the loaded
+    // value.  Bitcast it to the expected result type.
+    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+      SDValue &LdVal = NewResults[0];
+      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
      }
+
      NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
      DCI.CombineTo(N, NewResults);
      DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
@@ -9138,6 +9086,14 @@ static SDValue CombineBaseUpdate(SDNode *N,
    return SDValue();
  }
  
+/// PerformVLDCombine - Try the base-update combine on node N, but only once
+/// the DAG has been legalized and only when the combiner is not being driven
+/// by the legalizer.  Returns an empty SDValue when the combine is skipped.
+static SDValue PerformVLDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  const bool SkipCombine = DCI.isBeforeLegalize() || DCI.isCalledByLegalizer();
+  return SkipCombine ? SDValue() : CombineBaseUpdate(N, DCI);
+}
+
  /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
  /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
  /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
@@ -9251,6 +9207,164 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
    return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
  }
  
+/// PerformLOADCombine - Target-specific dag combine xforms for ISD::LOAD.
+/// A normal load of a legal vector type is handed to CombineBaseUpdate so it
+/// can be turned into a VLD1_UPD; every other load is left untouched.
+static SDValue PerformLOADCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  if (!ISD::isNormalLoad(N))
+    return SDValue();
+
+  // Only vector results whose type is legal for the target qualify.
+  EVT ResultVT = N->getValueType(0);
+  if (!ResultVT.isVector() ||
+      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
+    return SDValue();
+
+  return CombineBaseUpdate(N, DCI);
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->isVolatile())
+    return SDValue();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
+  // pack all of the elements in one place.  Next, store to memory in fewer
+  // chunks.
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+  if (St->isTruncatingStore() && VT.isVector()) {
+    SelectionDAG &DAG = DCI.DAG;
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    EVT StVT = St->getMemoryVT();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+
+    unsigned SizeRatio  = FromEltSz / ToEltSz;
+    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle.
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                     NumElems*SizeRatio);
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDLoc DL(St);
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; ++i)
+      ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (MVT Tp : MVT::integer_valuetypes()) {
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+        StoreType = Tp;
+    }
+    // Didn't find a legal store type.
+    if (!TLI.isTypeLegal(StoreType))
+      return SDValue();
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, DL,
+                                        TLI.getPointerTy());
+    SDValue BasePtr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
+    for (unsigned I = 0; I < E; I++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(I, DL));
+      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                            Increment);
+      Chains.push_back(Ch);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (!ISD::isNormalStore(St))
+    return SDValue();
+
+  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
+  // ARM stores of arguments in the same cache line.
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse()) {
+    SelectionDAG  &DAG = DCI.DAG;
+    bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
+    SDLoc DL(St);
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
+                                  StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
+                                  BasePtr, St->getPointerInfo(), St->isVolatile(),
+                                  St->isNonTemporal(), St->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, DL, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL,
+                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
+                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(),
+                        std::min(4U, St->getAlignment() / 2));
+  }
+
+  if (StVal.getValueType() == MVT::i64 &&
+      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+    // Bitcast an i64 store extracted from a vector to f64.
+    // Otherwise, the i64 value will be legalized to a pair of i32 values.
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc dl(StVal);
+    SDValue IntVec = StVal.getOperand(0);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                   IntVec.getValueType().getVectorNumElements());
+    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
+    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                                 Vec, StVal.getOperand(1));
+    dl = SDLoc(N);
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
+    // Make the DAGCombiner fold the bitcasts.
+    DCI.AddToWorklist(Vec.getNode());
+    DCI.AddToWorklist(ExtElt.getNode());
+    DCI.AddToWorklist(V.getNode());
+    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+                        St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(), St->getAlignment(),
+                        St->getAAInfo());
+  }
+
+  // If this is a legal vector store, try to combine it into a VST1_UPD.
+  if (ISD::isNormalStore(N) && VT.isVector() &&
+      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return CombineBaseUpdate(N, DCI);
+
+  return SDValue();
+}
+
  // isConstVecPow2 - Return true if each vector element is a power of 2, all
  // elements are the same constant, C, and Log2(C) ranges from 1 to 32.
  static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
@@ -9307,23 +9421,27 @@ static SDValue PerformVCVTCombine(SDNode *N,
  
    MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
    MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
-  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 ||
+      NumLanes > 4) {
      // These instructions only exist converting from f32 to i32. We can handle
      // smaller integers by generating an extra truncate, but larger ones would
-    // be lossy.
+    // be lossy. We also can't handle more than 4 lanes, since these
+    // instructions only support v2i32/v4i32 types.
      return SDValue();
    }
  
+  SDLoc dl(N);
    unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
      Intrinsic::arm_neon_vcvtfp2fxu;
-  unsigned NumLanes = Op.getValueType().getVectorNumElements();
-  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
+  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                                   NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
-                                 DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
-                                 DAG.getConstant(Log2_64(C), MVT::i32));
+                                 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
+                                 N0,
+                                 DAG.getConstant(Log2_64(C), dl, MVT::i32));
  
    if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
-    FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv);
+    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
  
    return FixConv;
  }
@@ -9365,19 +9483,20 @@ static SDValue PerformVDIVCombine(SDNode *N,
      return SDValue();
    }
  
+  SDLoc dl(N);
    SDValue ConvInput = Op.getOperand(0);
    unsigned NumLanes = Op.getValueType().getVectorNumElements();
    if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
      ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
-                            SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                              ConvInput);
  
    unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
      Intrinsic::arm_neon_vcvtfxu2fp;
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                       Op.getValueType(),
-                     DAG.getConstant(IntrinsicOpcode, MVT::i32),
-                     ConvInput, DAG.getConstant(Log2_64(C), MVT::i32));
+                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
+                     ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32));
  }
  
  /// Getvshiftimm - Check if this is a valid build_vector for the immediate
@@ -9538,8 +9657,9 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
        VShiftOpc = ARMISD::VQRSHRNsu; break;
      }
  
-    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+    SDLoc dl(N);
+    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
+                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
    }
  
    case Intrinsic::arm_neon_vshiftins: {
@@ -9555,9 +9675,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
        llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
      }
  
-    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
+    SDLoc dl(N);
+    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                         N->getOperand(1), N->getOperand(2),
-                       DAG.getConstant(Cnt, MVT::i32));
+                       DAG.getConstant(Cnt, dl, MVT::i32));
    }
  
    case Intrinsic::arm_neon_vqrshifts:
@@ -9602,9 +9723,11 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
    default: llvm_unreachable("unexpected shift opcode");
  
    case ISD::SHL:
-    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
-      return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0),
-                         DAG.getConstant(Cnt, MVT::i32));
+    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
+      SDLoc dl(N);
+      return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, dl, MVT::i32));
+    }
      break;
  
    case ISD::SRA:
@@ -9612,8 +9735,9 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
      if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
        unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
                              ARMISD::VSHRs : ARMISD::VSHRu);
-      return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0),
-                         DAG.getConstant(Cnt, MVT::i32));
+      SDLoc dl(N);
+      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, dl, MVT::i32));
      }
    }
    return SDValue();
@@ -9839,10 +9963,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
    case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
    case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
    case ARMISD::VLD2DUP:
    case ARMISD::VLD3DUP:
    case ARMISD::VLD4DUP:
-    return CombineBaseUpdate(N, DCI);
+    return PerformVLDCombine(N, DCI);
    case ARMISD::BUILD_VECTOR:
      return PerformARMBUILD_VECTORCombine(N, DCI);
    case ISD::INTRINSIC_VOID:
@@ -9862,7 +9987,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
      case Intrinsic::arm_neon_vst2lane:
      case Intrinsic::arm_neon_vst3lane:
      case Intrinsic::arm_neon_vst4lane:
-      return CombineBaseUpdate(N, DCI);
+      return PerformVLDCombine(N, DCI);
      default: break;
      }
      break;
@@ -9925,10 +10050,8 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
    const Function *F = MF.getFunction();
  
    // See if we can use NEON instructions for this...
-  if ((!IsMemset || ZeroMemset) &&
-      Subtarget->hasNEON() &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat)) {
+  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
+      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
      bool Fast;
      if (Size >= 16 &&
          (memOpAlign(SrcAlign, DstAlign, 16) ||
@@ -9973,6 +10096,28 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
    return false;
  }
  
+bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+  EVT VT = ExtVal.getValueType();
+
+  if (!isTypeLegal(VT))
+    return false;
+
+  // Don't create a loadext if we can fold the extension into a wide/long
+  // instruction.
+  // If there's more than one user instruction, the loadext is desirable no
+  // matter what.  There can be two uses by the same instruction.
+  if (ExtVal->use_empty() ||
+      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
+    return true;
+
+  SDNode *U = *ExtVal->use_begin();
+  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
+       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
+    return false;
+
+  return true;
+}
+
  bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
    if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
      return false;
@@ -10121,7 +10266,8 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
  /// isLegalAddressingMode - Return true if the addressing mode represented
  /// by AM is legal for this target, for a load/store of the specified type.
  bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              Type *Ty) const {
+                                              Type *Ty,
+                                              unsigned AS) const {
    EVT VT = getValueType(Ty, true);
    if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
      return false;
@@ -10186,9 +10332,9 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
  bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
    // Thumb2 and ARM modes can use cmn for negative immediates.
    if (!Subtarget->isThumb())
-    return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1;
+    return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
    if (Subtarget->isThumb2())
-    return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1;
+    return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
    // Thumb1 doesn't have cmn, and only 8-bit immediates.
    return Imm >= 0 && Imm <= 255;
  }
@@ -10199,7 +10345,7 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  /// immediate into a register.
  bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
    // Same encoding for add/sub, just flip the sign.
-  int64_t AbsImm = llvm::abs64(Imm);
+  int64_t AbsImm = std::abs(Imm);
    if (!Subtarget->isThumb())
      return ARM_AM::getSOImmVal(AbsImm) != -1;
    if (Subtarget->isThumb2())
@@ -10223,7 +10369,7 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
        if (RHSC < 0 && RHSC > -256) {
          assert(Ptr->getOpcode() == ISD::ADD);
          isInc = false;
-        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
          return true;
        }
      }
@@ -10237,7 +10383,7 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
        if (RHSC < 0 && RHSC > -0x1000) {
          assert(Ptr->getOpcode() == ISD::ADD);
          isInc = false;
-        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
          Base = Ptr->getOperand(0);
          return true;
        }
@@ -10280,11 +10426,11 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
      if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
-      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
        isInc = Ptr->getOpcode() == ISD::ADD;
-      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
+      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
        return true;
      }
    }
@@ -10526,7 +10672,8 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
  
  typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
  RCPair
-ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ARMTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                const std::string &Constraint,
                                                  MVT VT) const {
    if (Constraint.size() == 1) {
      // GCC ARM Constraint Letters
@@ -10540,6 +10687,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
          return RCPair(0U, &ARM::hGPRRegClass);
        break;
      case 'r':
+      if (Subtarget->isThumb1Only())
+        return RCPair(0U, &ARM::tGPRRegClass);
        return RCPair(0U, &ARM::GPRRegClass);
      case 'w':
        if (VT == MVT::Other)
@@ -10570,7 +10719,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
    if (StringRef("{cc}").equals_lower(Constraint))
      return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
  
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  }
  
  /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
@@ -10729,7 +10878,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
          }
          return;
      }
-    Result = DAG.getTargetConstant(CVal, Op.getValueType());
+    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
      break;
    }
  
@@ -10775,7 +10924,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
    SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                           getPointerTy());
  
-  Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
+  Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
  
    SDLoc dl(Op);
    TargetLowering::CallLoweringInfo CLI(DAG);
@@ -10797,7 +10946,7 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
    SDValue Size  = Op.getOperand(1);
  
    SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
-                              DAG.getConstant(2, MVT::i32));
+                              DAG.getConstant(2, DL, MVT::i32));
  
    SDValue Flag;
    Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
@@ -10850,11 +10999,7 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
  
    // there can be 1's on either or both "outsides", all the "inside"
    // bits must be 0's
-  unsigned TO = CountTrailingOnes_32(v);
-  unsigned LO = CountLeadingOnes_32(v);
-  v = (v >> TO) << TO;
-  v = (v << LO) >> LO;
-  return v == 0;
+  return isShiftedMask_32(~v);
  }
  
  /// isFPImmLegal - Returns true if the target can instruction select the
@@ -11024,11 +11169,11 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
  }
  
  // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-void ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                           AtomicOrdering Ord, bool IsStore,
                                           bool IsLoad) const {
    if (!getInsertFencesForAtomic())
-    return;
+    return nullptr;
  
    switch (Ord) {
    case NotAtomic:
@@ -11036,27 +11181,27 @@ void ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
      llvm_unreachable("Invalid fence: unordered/non-atomic");
    case Monotonic:
    case Acquire:
-    return; // Nothing to do
+    return nullptr; // Nothing to do
    case SequentiallyConsistent:
      if (!IsStore)
-      return; // Nothing to do
-              /*FALLTHROUGH*/
+      return nullptr; // Nothing to do
+    /*FALLTHROUGH*/
    case Release:
    case AcquireRelease:
      if (Subtarget->isSwift())
-      makeDMB(Builder, ARM_MB::ISHST);
+      return makeDMB(Builder, ARM_MB::ISHST);
      // FIXME: add a comment with a link to documentation justifying this.
      else
-      makeDMB(Builder, ARM_MB::ISH);
-    return;
+      return makeDMB(Builder, ARM_MB::ISH);
    }
+  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
  }
  
-void ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                            AtomicOrdering Ord, bool IsStore,
                                            bool IsLoad) const {
    if (!getInsertFencesForAtomic())
-    return;
+    return nullptr;
  
    switch (Ord) {
    case NotAtomic:
@@ -11064,13 +11209,13 @@ void ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
      llvm_unreachable("Invalid fence: unordered/not-atomic");
    case Monotonic:
    case Release:
-    return; // Nothing to do
+    return nullptr; // Nothing to do
    case Acquire:
    case AcquireRelease:
-    case SequentiallyConsistent:
-    makeDMB(Builder, ARM_MB::ISH);
-    return;
+  case SequentiallyConsistent:
+    return makeDMB(Builder, ARM_MB::ISH);
    }
+  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
  }
  
  // Loads and stores less than 64-bits are already atomic; ones above that
@@ -11086,6 +11231,9 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  // are doomed anyway, so defer to the default libcall and blame the OS when
  // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
  // anything for those.
+// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
+// guarantee, see DDI0406C ARM architecture reference manual,
+// sections A8.8.72-74 LDRD)
  bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
    unsigned Size = LI->getType()->getPrimitiveSizeInBits();
    return (Size == 64) && !Subtarget->isMClass();
@@ -11093,14 +11241,46 @@ bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  
  // For the real atomic operations, we have ldrex/strex up to 32 bits,
  // and up to 64 bits on the non-M profiles
-bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+TargetLoweringBase::AtomicRMWExpansionKind
+ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
    unsigned Size = AI->getType()->getPrimitiveSizeInBits();
-  return Size <= (Subtarget->isMClass() ? 32U : 64U);
+  return (Size <= (Subtarget->isMClass() ? 32U : 64U))
+             ? AtomicRMWExpansionKind::LLSC
+             : AtomicRMWExpansionKind::None;
  }
  
  // This has so far only been implemented for MachO.
  bool ARMTargetLowering::useLoadStackGuardNode() const {
-  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO;
+  return Subtarget->isTargetMachO();
+}
+
+bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  // If we do not have NEON, vector types are not natively supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating point values and vector values map to the same register file.
+  // Therefore, althought we could do a store extract of a vector type, this is
+  // better to leave at float as we have more freedom in the addressing mode for
+  // those.
+  if (VectorTy->isFPOrFPVectorTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+  // We can do a store + vector extract on any vector that fits perfectly in a D
+  // or Q register.
+  if (BitWidth == 64 || BitWidth == 128) {
+    Cost = 0;
+    return true;
+  }
+  return false;
  }
  
  Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
@@ -11159,17 +11339,17 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
-    return Builder.CreateCall3(Strex, Lo, Hi, Addr);
+    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
    }
  
    Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
    Type *Tys[] = { Addr->getType() };
    Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
  
-  return Builder.CreateCall2(
-      Strex, Builder.CreateZExtOrBitCast(
-                 Val, Strex->getFunctionType()->getParamType(0)),
-      Addr);
+  return Builder.CreateCall(
+      Strex, {Builder.CreateZExtOrBitCast(
+                  Val, Strex->getFunctionType()->getParamType(0)),
+              Addr});
  }
  
  enum HABaseType {
@@ -11231,7 +11411,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
    return (Members > 0 && Members <= 4);
  }
  
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate.
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
+/// passing according to AAPCS rules.
  bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
      Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
    if (getEffectiveCallingConv(CallConv, isVarArg) !=
@@ -11240,7 +11422,9 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
  
    HABaseType Base = HA_UNKNOWN;
    uint64_t Members = 0;
-  bool result = isHomogeneousAggregate(Ty, Base, Members);
-  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump());
-  return result;
+  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
+  DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+
+  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
+  return IsHA || IsIntArray;
  }