Lower thumbv4t & thumbv5 lo->lo copies through a push-pop sequence

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index f0fab69b66b3be48131d86cfa873735b7fac7903..24992c7d6f2fa23ff35e4cc4727fd7ed928fa8e0 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -70,9 +70,9 @@ namespace {
    class ARMCCState : public CCState {
    public:
      ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
-               const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
-               LLVMContext &C, ParmContext PC)
-        : CCState(CC, isVarArg, MF, TM, locs, C) {
+               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+               ParmContext PC)
+        : CCState(CC, isVarArg, MF, locs, C) {
        assert(((PC == Call) || (PC == Prologue)) &&
               "ARMCCState users must specify whether their context is call"
               "or prologue generation.");
@@ -166,8 +166,8 @@ static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
  ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
    Subtarget = &TM.getSubtarget<ARMSubtarget>();
-  RegInfo = TM.getRegisterInfo();
-  Itins = TM.getInstrItineraryData();
+  RegInfo = TM.getSubtargetImpl()->getRegisterInfo();
+  Itins = TM.getSubtargetImpl()->getInstrItineraryData();
  
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  
@@ -267,10 +267,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
        { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: double __aeabi_drsub(double x, double y) (rsub)
        { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  
        // Double-precision floating-point comparison helper functions
        // RTABI chapter 4.1.2, Table 3
+      // FIXME: void __aeabi_cdcmpeq(double, double)
+      // FIXME: void __aeabi_cdcmple(double, double)
+      // FIXME: void __aeabi_cdrcmple(double, double)
        { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
        { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
@@ -285,10 +289,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
        { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: void __aeabi_frsub(float x, float y)
        { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  
        // Single-precision floating-point comparison helper functions
        // RTABI chapter 4.1.2, Table 5
+      // FIXME: void __aeabi_cfcmpeq(float, float)
+      // FIXME: void __aeabi_cfcmple(float, float)
+      // FIXME: void __aeabi_cfrcmple(float, float)
        { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
        { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
        { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
@@ -313,6 +321,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
        // RTABI chapter 4.1.2, Table 7
        { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: float __aeabi_f2f(short)
+      // FIXME: float __aeabi_h2f_alt(short)
+      // FIXME: short __aeabi_f2h(float)
+      // FIXME: short __aeabi_f2h_alt(float)
+      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: short __aeabi_d2h_alt(double)
  
        // Integer to floating-point conversions.
        // RTABI chapter 4.1.2, Table 8
@@ -327,27 +341,48 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  
        // Long long helper functions
        // RTABI chapter 4.2, Table 9
-      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_I64,     "__aeabi_lmul",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: __aeabi_ldivmod is SDIVREM not SDIV; we should custom lower this
+      { RTLIB::SDIV_I64,    "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIVREM_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: __aeabi_uldivmod is UDIVREM not UDIV; we should custom lower this
+      { RTLIB::UDIV_I64,    "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SHL_I64,     "__aeabi_llsl",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRL_I64,     "__aeabi_llsr",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRA_I64,     "__aeabi_lasr",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: int __aeabi_lcmp(long long, long long)
+      // FIXME: int __aeabi_ulcmp(unsigned long long, unsigned long long)
  
        // Integer division functions
        // RTABI chapter 4.3.1
-      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I8,     "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I16,    "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I32,    "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I8,     "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I16,    "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I32,    "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIVREM_I8,  "__aeabi_idivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIVREM_I16, "__aeabi_idivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIVREM_I32, "__aeabi_idivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIVREM_I8,  "__aeabi_uidivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
  
        // Memory operations
        // RTABI chapter 4.3.4
+      // FIXME: void __aeabi_memcpy8(void *, const void *, size_t)
+      // FIXME: void __aeabi_memcpy4(void *, const void *, size_t)
        { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: void __aeabi_memmove8(void *, const void *, size_t)
+      // FIXME: void __aeabi_memmove4(void *, const void *, size_t)
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: void __aeabi_memset8(void *, size_t, int)
+      // FIXME: void __aeabi_memset4(void *, size_t, int)
        { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      // FIXME: void __aeabi_memclr8(void *, size_t)
+      // FIXME: void __aeabi_memclr4(void *, size_t)
+      // FIXME: void __aeabi_memclr(void *, size_t)
      };
  
      for (const auto &LC : LibraryCalls) {
@@ -356,6 +391,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
+
+    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
    }
  
    if (Subtarget->isTargetWindows()) {
@@ -387,6 +425,19 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
    }
  
+  // The half <-> float conversion functions are always soft-float, but are
+  // needed for some targets which use a hard-float calling convention by
+  // default.
+  if (Subtarget->isAAPCS_ABI()) {
+    setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
+  } else {
+    setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
+    setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
+    setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+  }
+
    if (Subtarget->isThumb1Only())
      addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
    else
@@ -396,8 +447,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      addRegisterClass(MVT::f32, &ARM::SPRRegClass);
      if (!Subtarget->isFPOnlySP())
        addRegisterClass(MVT::f64, &ARM::DPRRegClass);
-
-    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    }
  
    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -582,8 +631,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  
    computeRegisterProperties();
  
-  // ARM does not have f32 extending load.
+  // ARM does not have floating-point extending loads.
    setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+
+  // ... or truncating stores
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  
    // ARM does not have i1 sign extending load.
    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -658,31 +713,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    }
  
    // FIXME: Also set divmod for SREM on EABI
-  setOperationAction(ISD::SREM,  MVT::i32, Expand);
-  setOperationAction(ISD::UREM,  MVT::i32, Expand);
-  // Register based DivRem for AEABI (RTABI 4.2)
-  if (Subtarget->isTargetAEABI()) {
-    setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
-    setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
-    setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
-    setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
-    setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
-    setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
-    setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
-    setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
-
-    setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
-
-    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
-    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
-  } else {
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  if (!Subtarget->isTargetAEABI()) {
      setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
      setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
    }
@@ -725,7 +758,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      // On v8, we have particularly efficient implementations of atomic fences
      // if they can be combined with nearby atomic loads and stores.
      if (!Subtarget->hasV8Ops()) {
-      // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
        setInsertFencesForAtomic(true);
      }
    } else {
@@ -825,10 +858,17 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
        setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
      }
-    // Special handling for half-precision FP.
+
+    // v8 adds f64 <-> f16 conversion. Before that it should be expanded.
+    if (!Subtarget->hasV8Ops()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+    }
+
+    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
      if (!Subtarget->hasFP16()) {
-      setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
-      setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
+      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
      }
    }
  
@@ -844,6 +884,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      }
    }
  
+  // ARMv8 implements a lot of rounding-like FP operations.
+  if (Subtarget->hasV8Ops()) {
+    static MVT RoundingTypes[] = {MVT::f32, MVT::f64};
+    for (const auto Ty : RoundingTypes) {
+      setOperationAction(ISD::FFLOOR, Ty, Legal);
+      setOperationAction(ISD::FCEIL, Ty, Legal);
+      setOperationAction(ISD::FROUND, Ty, Legal);
+      setOperationAction(ISD::FTRUNC, Ty, Legal);
+      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+      setOperationAction(ISD::FRINT, Ty, Legal);
+    }
+  }
    // We have target-specific dag combine patterns for the following nodes:
    // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
    setTargetDAGCombine(ISD::ADD);
@@ -1119,7 +1171,8 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  
    // Load are scheduled for latency even if there instruction itinerary
    // is not available.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
  
    if (MCID.getNumDefs() == 0)
@@ -1256,8 +1309,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
  
    // Assign locations to each value returned by this call.
    SmallVector<CCValAssign, 16> RVLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext(), Call);
    CCInfo.AnalyzeCallResult(Ins,
                             CCAssignFnForNode(CallConv, /* Return*/ true,
                                               isVarArg));
@@ -1417,8 +1470,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                    *DAG.getContext(), Call);
    CCInfo.AnalyzeCallOperands(Outs,
                               CCAssignFnForNode(CallConv, /* Return*/ false,
                                                 isVarArg));
@@ -1646,14 +1699,30 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
      bool isStub = (isExt && Subtarget->isTargetMachO()) &&
                     getTargetMachine().getRelocationModel() != Reloc::Static;
-    isARMFunc = !Subtarget->isThumb() || isStub;
+    isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
      // ARM call to a local ARM function is predicable.
      isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
      // tBX takes a register source operand.
      if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
        assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
        Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
-                           DAG.getTargetGlobalAddress(GV, dl, getPointerTy()));
+                           DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
+                                                      0, ARMII::MO_NONLAZY));
+      Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+                           MachinePointerInfo::getGOT(), false, false, true, 0);
+    } else if (Subtarget->isTargetCOFF()) {
+      assert(Subtarget->isTargetWindows() &&
+             "Windows is the only supported COFF target");
+      unsigned TargetFlags = GV->hasDLLImportStorageClass()
+                                 ? ARMII::MO_DLLIMPORT
+                                 : ARMII::MO_NO_FLAG;
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0,
+                                          TargetFlags);
+      if (GV->hasDLLImportStorageClass())
+        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                             DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(),
+                                         Callee), MachinePointerInfo::getGOT(),
+                             false, false, false, 0);
      } else {
        // On ELF targets for PIC code, direct calls should go through the PLT
        unsigned OpFlags = 0;
@@ -1666,7 +1735,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
      isDirect = true;
      bool isStub = Subtarget->isTargetMachO() &&
                    getTargetMachine().getRelocationModel() != Reloc::Static;
-    isARMFunc = !Subtarget->isThumb() || isStub;
+    isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
      // tBX takes a register source operand.
      const char *Sym = S->getSymbol();
      if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
@@ -1727,7 +1796,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    // Add a register mask operand representing the call-preserved registers.
    if (!isTailCall) {
      const uint32_t *Mask;
-    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const TargetRegisterInfo *TRI =
+        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
      const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
      if (isThisReturn) {
        // For 'this' returns, use the R0-preserving mask if applicable
@@ -1927,17 +1997,30 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    if (Subtarget->isThumb1Only())
      return false;
  
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on ARM when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    if (GV->hasExternalWeakLinkage())
+      return false;
+  }
+
    // If the calling conventions do not match, then we'd better make sure the
    // results are returned in the same way as what the caller expects.
    if (!CCMatch) {
      SmallVector<CCValAssign, 16> RVLocs1;
-    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
+    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                       *DAG.getContext(), Call);
      CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
  
      SmallVector<CCValAssign, 16> RVLocs2;
-    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
+    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                       *DAG.getContext(), Call);
      CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
  
      if (RVLocs1.size() != RVLocs2.size())
@@ -1971,8 +2054,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
      // Check if stack adjustment is needed. For now, do not do this if any
      // argument is passed on the stack.
      SmallVector<CCValAssign, 16> ArgLocs;
-    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                      *DAG.getContext(), Call);
      CCInfo.AnalyzeCallOperands(Outs,
                                 CCAssignFnForNode(CalleeCC, false, isVarArg));
      if (CCInfo.getNextStackOffset()) {
@@ -1982,7 +2065,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        // the caller's fixed stack objects.
        MachineFrameInfo *MFI = MF.getFrameInfo();
        const MachineRegisterInfo *MRI = &MF.getRegInfo();
-      const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+      const TargetInstrInfo *TII =
+          getTargetMachine().getSubtargetImpl()->getInstrInfo();
        for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
             i != e;
             ++i, ++realArgIdx) {
@@ -2025,7 +2109,7 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    LLVMContext &Context) const {
    SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
    return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
                                                      isVarArg));
  }
@@ -2073,8 +2157,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
    SmallVector<CCValAssign, 16> RVLocs;
  
    // CCState - Info about the registers and stack slots.
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext(), Call);
  
    // Analyze outgoing return values.
    CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
@@ -2085,6 +2169,10 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
    RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
    bool isLittleEndian = Subtarget->isLittle();
  
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  AFI->setReturnRegsCount(RVLocs.size());
+
    // Copy the result values into the output registers.
    for (unsigned i = 0, realRVLocIdx = 0;
         i != RVLocs.size();
@@ -2489,15 +2577,23 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
           "Windows on ARM expects to use movw/movt");
  
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const ARMII::TOF TargetFlags =
+    (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
    EVT PtrVT = getPointerTy();
+  SDValue Result;
    SDLoc DL(Op);
  
    ++NumMovwMovt;
  
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
-  return DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
-                     DAG.getTargetGlobalAddress(GV, DL, PtrVT));
+  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
+                                                  TargetFlags));
+  if (GV->hasDLLImportStorageClass())
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+  return Result;
  }
  
  SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
@@ -2546,9 +2642,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
    switch (IntNo) {
    default: return SDValue();    // Don't custom lower most intrinsics.
    case Intrinsic::arm_rbit: {
-    assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+    assert(Op.getOperand(1).getValueType() == MVT::i32 &&
             "RBIT intrinsic must have i32 type!");
-    return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(0));
+    return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1));
    }
    case Intrinsic::arm_thread_pointer: {
      EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -2718,7 +2814,10 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
      NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
    }
  
-  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned Align = MF.getTarget()
+                       .getSubtargetImpl()
+                       ->getFrameLowering()
+                       ->getStackAlignment();
    ArgRegsSize = NumGPRs * 4;
  
    // If parameter is split between stack and GPRs...
@@ -2895,8 +2994,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                    *DAG.getContext(), Prologue);
    CCInfo.AnalyzeFormalArguments(Ins,
                                  CCAssignFnForNode(CallConv, /* Return*/ false,
                                                    isVarArg));
@@ -5712,7 +5811,7 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
    // operation legalization where we can't create illegal types.
    return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                          LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
-                        LD->getMemoryVT(), LD->isVolatile(),
+                        LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(),
                          LD->isNonTemporal(), LD->getAlignment());
  }
  
@@ -6273,7 +6372,8 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
  void ARMTargetLowering::
  SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
                         MachineBasicBlock *DispatchBB, int FI) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
    DebugLoc dl = MI->getDebugLoc();
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -6388,7 +6488,8 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
  
  MachineBasicBlock *ARMTargetLowering::
  EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
    DebugLoc dl = MI->getDebugLoc();
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -6905,7 +7006,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
    // This pseudo instruction has 3 operands: dst, src, size
    // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
    // Otherwise, we will generate unrolled scalar copies.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator It = BB;
    ++It;
@@ -7139,7 +7241,7 @@ MachineBasicBlock *
  ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
                                         MachineBasicBlock *MBB) const {
    const TargetMachine &TM = getTargetMachine();
-  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
    assert(Subtarget->isTargetWindows() &&
@@ -7195,8 +7297,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
  
    AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
                                        ARM::SP)
-                              .addReg(ARM::SP, RegState::Define)
-                              .addReg(ARM::R4, RegState::Kill)));
+                              .addReg(ARM::SP).addReg(ARM::R4)));
  
    MI->eraseFromParent();
    return MBB;
@@ -7205,7 +7306,8 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
  MachineBasicBlock *
  ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
    DebugLoc dl = MI->getDebugLoc();
    bool isThumb2 = Subtarget->isThumb2();
    switch (MI->getOpcode()) {
@@ -7475,8 +7577,8 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
    // Rename pseudo opcodes.
    unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
    if (NewOpc) {
-    const ARMBaseInstrInfo *TII =
-      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
+    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
+        getTargetMachine().getSubtargetImpl()->getInstrInfo());
      MCID = &TII->get(NewOpc);
  
      assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
@@ -8411,8 +8513,6 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
      if (DCI.DAG.getTargetLoweringInfo().isBigEndian())
        std::swap (NewLD1, NewLD2);
      SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
-    DCI.RemoveFromWorklist(LD);
-    DAG.DeleteNode(LD);
      return Result;
    }
  
@@ -8575,7 +8675,7 @@ static SDValue PerformSTORECombine(SDNode *N,
    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                        St->getPointerInfo(), St->isVolatile(),
                        St->isNonTemporal(), St->getAlignment(),
-                      St->getTBAAInfo());
+                      St->getAAInfo());
  }
  
  /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
@@ -9665,8 +9765,10 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
    return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
  }
  
-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned,
-                                                      bool *Fast) const {
+bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
    // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
    bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  
@@ -9720,11 +9822,12 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
      bool Fast;
      if (Size >= 16 &&
          (memOpAlign(SrcAlign, DstAlign, 16) ||
-         (allowsUnalignedMemoryAccesses(MVT::v2f64, 0, &Fast) && Fast))) {
+         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
        return MVT::v2f64;
      } else if (Size >= 8 &&
                 (memOpAlign(SrcAlign, DstAlign, 8) ||
-                (allowsUnalignedMemoryAccesses(MVT::f64, 0, &Fast) && Fast))) {
+                (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
+                 Fast))) {
        return MVT::f64;
      }
    }
@@ -10531,7 +10634,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
    assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
    unsigned Opcode = Op->getOpcode();
    assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
-      "Invalid opcode for Div/Rem lowering");
+         "Invalid opcode for Div/Rem lowering");
    bool isSigned = (Opcode == ISD::SDIVREM);
    EVT VT = Op->getValueType(0);
    Type *Ty = VT.getTypeForEVT(*DAG.getContext());
@@ -10539,10 +10642,10 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
    RTLIB::Libcall LC;
    switch (VT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected request for libcall!");
-  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
-  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
-  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
-  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
    }
  
    SDValue InChain = DAG.getEntryNode();
@@ -10590,7 +10693,7 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
    Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
    Flag = Chain.getValue(1);
  
-  SDVTList NodeTys = DAG.getVTList(MVT::i32, MVT::Glue);
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
  
    SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
@@ -10773,12 +10876,16 @@ bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
    return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit;
  }
  
+// This has so far only been implemented for MachO.
+bool ARMTargetLowering::useLoadStackGuardNode() const {
+  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO;
+}
+
  Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                           AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
-  bool IsAcquire =
-      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsAcquire = isAtLeastAcquire(Ord);
  
    // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
    // intrinsic must return {i32, i32} and we have to recombine them into a
@@ -10814,8 +10921,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                                 Value *Addr,
                                                 AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  bool IsRelease =
-      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsRelease = isAtLeastRelease(Ord);
  
    // Since the intrinsics must have legal type, the i64 intrinsics take two
    // parameters: "i32, i32". We must marshal Val into the appropriate form
@@ -10913,6 +11019,6 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    HABaseType Base = HA_UNKNOWN;
    uint64_t Members = 0;
    bool result = isHomogeneousAggregate(Ty, Base, Members);
-  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump(); dbgs() << "\n");
+  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump());
    return result;
  }