#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalValue.h"
-#include "llvm/Instruction.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Type.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Type.h"
using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT.isInteger()) {
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
// Neon does not support some operations on v1i64 and v2i64 types.
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
+
+  // NEON does not have a single-instruction CTPOP for vectors with element
+  // types wider than 8 bits. However, custom lowering can leverage the
+  // v8i8/v16i8 vcnt instruction.
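+  // For example, 'call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %v)' is
+  // custom-lowered via vcnt.8 on the bitcast v8i8 value plus a short
+  // shuffle/add sequence; see LowerCTPOP below.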
+ setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- // These are expanded into libcalls.
- if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) {
- // v7M has a hardware divider
+ if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
+ !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+    // These are expanded into libcalls if the CPU doesn't have a HW divider.
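+    // For example (a sketch of the effect): with Thumb2 hardware divide
+    // (e.g. Cortex-M3), i32 SDIV/UDIV stay legal; an ARM-mode core without
+    // hwdiv expands them to __aeabi_idiv/__aeabi_uidiv.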
setOperationAction(ISD::SDIV, MVT::i32, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
}
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
  // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
setInsertFencesForAtomic(true);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::MUL);
-
- if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) {
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::OR);
- setTargetDAGCombine(ISD::XOR);
- }
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
setSchedulingPreference(Sched::Hybrid);
//// temporary - rewrite interface to use type
- maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
- maxStoresPerMemset = 16;
+ maxStoresPerMemset = 8;
maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+ maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+ maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+ maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
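+  // E.g. with these caps a 16-byte @llvm.memcpy can be inlined as roughly
+  // four word-sized stores instead of a libcall, while the lower OptSize
+  // caps make the call more likely when optimizing for size.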
// On ARM arguments smaller than 4 bytes are extended, so all arguments
// are at least 4 bytes aligned.
benefitFromCodePlacementOpt = true;
// Prefer likely predicted branches to selects on out-of-order cores.
- predictableSelectIsExpensive = Subtarget->isCortexA9();
+ predictableSelectIsExpensive = Subtarget->isLikeA9();
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
// Map v4i64 to QQ registers but do not make the type legal. Similarly map
// v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
// load / store 4 to 8 consecutive D registers.
// FIXME: handle tail calls differently.
unsigned CallOpc;
+ bool HasMinSizeAttr = MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::MinSize);
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- else if (doesNotRet && isDirect && !isARMFunc &&
- Subtarget->hasRAS() && !Subtarget->isThumb1Only())
- // "mov lr, pc; b _foo" to avoid confusing the RSP
- CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
} else {
- if (!isDirect && !Subtarget->hasV5TOps()) {
+ if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- } else if (doesNotRet && isDirect && Subtarget->hasRAS())
+ else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+             // Emit a regular call when code size is the priority
+ !HasMinSizeAttr)
// "mov lr, pc; b _foo" to avoid confusing the RSP
CallOpc = ARMISD::CALL_NOLINK;
else
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void
-ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const {
+ARMTargetLowering::HandleByVal(
+ CCState *State, unsigned &size, unsigned Align) const {
unsigned reg = State->AllocateReg(GPRArgRegs, 4);
assert((State->getCallOrPrologue() == Prologue ||
State->getCallOrPrologue() == Call) &&
"unhandled ParmContext");
if ((!State->isFirstByValRegValid()) &&
(ARM::R0 <= reg) && (reg <= ARM::R3)) {
- State->setFirstByValReg(reg);
- // At a call site, a byval parameter that is split between
- // registers and memory needs its size truncated here. In a
- // function prologue, such byval parameters are reassembled in
- // memory, and are not truncated.
- if (State->getCallOrPrologue() == Call) {
- unsigned excess = 4 * (ARM::R4 - reg);
- assert(size >= excess && "expected larger existing stack allocation");
- size -= excess;
+ if (Subtarget->isAAPCS_ABI() && Align > 4) {
+ unsigned AlignInRegs = Align / 4;
+ unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
+ for (unsigned i = 0; i < Waste; ++i)
+ reg = State->AllocateReg(GPRArgRegs, 4);
+ }
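+    // For example (AAPCS, Align == 8): if the allocation above returned r1,
+    // one register is wasted so the 8-byte-aligned byval starts in the even
+    // register r2.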
+ if (reg != 0) {
+ State->setFirstByValReg(reg);
+ // At a call site, a byval parameter that is split between
+ // registers and memory needs its size truncated here. In a
+ // function prologue, such byval parameters are reassembled in
+ // memory, and are not truncated.
+ if (State->getCallOrPrologue() == Call) {
+ unsigned excess = 4 * (ARM::R4 - reg);
+ assert(size >= excess && "expected larger existing stack allocation");
+ size -= excess;
+ }
}
}
// Confiscate any remaining parameter registers to preclude their
}
}
+  // If the caller's vararg or byval argument has been split between
+  // registers and the stack, do not perform a tail call, since part of the
+  // argument lives in the caller's local frame.
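+  // For instance (hypothetical): in 'void f(int a, ...)', the prologue saves
+  // r1-r3 into f's own frame; a tail call would reuse that frame while the
+  // outgoing call still depends on it, so we conservatively bail out.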
+ const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
+ getInfo<ARMFunctionInfo>();
+ if (AFI_Caller->getVarArgsRegSaveSize())
+ return false;
+
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
return true;
}
+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
+ isVarArg));
+}
+
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
void
ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
DebugLoc dl, SDValue &Chain,
- unsigned ArgOffset) const {
+ const Value *OrigArg,
+ unsigned OffsetFromOrigArg,
+ unsigned ArgOffset,
+ bool ForceMutable) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
getPointerTy());
SmallVector<SDValue, 4> MemOps;
- for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) {
+ for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) {
const TargetRegisterClass *RC;
if (AFI->isThumb1OnlyFunction())
RC = &ARM::tGPRRegClass;
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
+ MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
false, false, 0);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
&MemOps[0], MemOps.size());
} else
// This will point to the next argument passed via stack.
- AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
+ AFI->setVarArgsFrameIndex(
+ MFI->CreateFixedObject(4, ArgOffset, !ForceMutable));
}
SDValue
CCInfo.AnalyzeFormalArguments(Ins,
CCAssignFnForNode(CallConv, /* Return*/ false,
isVarArg));
-
+
SmallVector<SDValue, 16> ArgValues;
int lastInsIndex = -1;
-
SDValue ArgValue;
+ Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+ unsigned CurArgIdx = 0;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
-
+ std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
+ CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
// Arguments stored in registers.
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
// Since they could be overwritten by lowering of arguments in case of
// a tail call.
if (Flags.isByVal()) {
- unsigned VARegSize, VARegSaveSize;
- computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
- VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0);
- unsigned Bytes = Flags.getByValSize() - VARegSize;
- if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
- int FI = MFI->CreateFixedObject(Bytes,
- VA.getLocMemOffset(), false);
- InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (!AFI->getVarArgsFrameIndex()) {
+ VarArgStyleRegisters(CCInfo, DAG,
+ dl, Chain, CurOrigArg,
+ Ins[VA.getValNo()].PartOffset,
+ VA.getLocMemOffset(),
+ true /*force mutable frames*/);
+ int VAFrameIndex = AFI->getVarArgsFrameIndex();
+ InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy()));
+ } else {
+ int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+ VA.getLocMemOffset(), false);
+ InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+ }
} else {
int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
VA.getLocMemOffset(), true);
// varargs
if (isVarArg)
- VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset());
+ VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0,
+ CCInfo.getNextStackOffset());
return Chain;
}
return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
+/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
+/// for each 16-bit element from the operand, repeated. The basic idea is to
+/// leverage vcnt to get the 8-bit counts, then gather and add the results.
+///
+/// Trace for v4i16:
+/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
+/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
+/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+///            [b0 b1 b2 b3 b4 b5 b6 b7]
+///           +[b1 b0 b3 b2 b5 b4 b7 b6]
+/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
+/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8 bits)
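+///
+/// Roughly, this selects to a short NEON sequence (a sketch; exact selection
+/// and scheduling may differ):
+///   vcnt.8   d1, d0      @ per-byte popcounts
+///   vrev16.8 d2, d1      @ swap adjacent bytes
+///   vadd.i8  d1, d1, d2  @ every byte now holds its halfword's bit-count
+///   vuzp.8   d1, d2      @ gather the counts, repeated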
+static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+
+ EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+ SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
+ SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
+ SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
+ return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
+}
+
+/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
+/// bit-count for each 16-bit element from the operand. We need slightly
+/// different sequencing for v4i16 and v8i16 to stay within NEON's available
+/// 64/128-bit registers.
+///
+/// Trace for v4i16:
+/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
+/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
+/// v8i16:Extended  = [k0 k1 k2 k3 k0 k1 k2 k3 ]
+/// v4i16:Extracted = [k0    k1    k2    k3    ]
+static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+
+ SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
+ if (VT.is64BitVector()) {
+ SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
+ DAG.getIntPtrConstant(0));
+ } else {
+ SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
+ BitCounts, DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
+ }
+}
+
+/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
+/// bit-count for each 32-bit element from the operand. The idea here is
+/// to split the vector into 16-bit elements, leverage the 16-bit count
+/// routine, and then combine the results.
+///
+/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
+/// input     = [v0    v1    ] (vi: 32-bit elements)
+/// Bitcast   = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
+/// Counts16  = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
+/// vrev: N0  = [k1 k0 k3 k2 ]
+///             [k0 k1 k2 k3 ]
+/// N1       =+ [k1 k0 k3 k2 ]
+///             [k0 k2 k1 k3 ]
+/// N2       =+ [k1 k3 k0 k2 ]
+///             [k0 k2 k1 k3 ]
+/// Extended =+ [k1 k3 k0 k2 ]
+///             [k0 k2       ]
+/// Extracted=+ [k1 k3       ]
+///
+static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ DebugLoc DL = N->getDebugLoc();
+
+ EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+
+ SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
+ SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
+ SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
+ SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
+ SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+
+ if (VT.is64BitVector()) {
+ SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
+ DAG.getIntPtrConstant(0));
+ } else {
+ SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+ }
+}
+
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = N->getValueType(0);
+
+ assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+ assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+ VT == MVT::v4i16 || VT == MVT::v8i16) &&
+ "Unexpected type for custom ctpop lowering");
+
+ if (VT.getVectorElementType() == MVT::i32)
+ return lowerCTPOP32BitElements(N, DAG);
+ else
+ return lowerCTPOP16BitElements(N, DAG);
+}
+
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
return SDValue();
}
+// Check if a VEXT instruction can handle the shuffle mask when the vector
+// sources of the shuffle are the same.
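+//
+// For example, the single-source mask <1, 2, 3, 0> on v4i16 is a rotation by
+// one element and can be selected as 'vext.16 d0, d1, d1, #1', with Imm == 1.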
+static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Assume that the first shuffle index is not UNDEF. Fail if it is.
+ if (M[0] < 0)
+ return false;
+
+ Imm = M[0];
+
+ // If this is a VEXT shuffle, the immediate value is the index of the first
+ // element. The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, just follow it
+ // back to index zero and keep going.
+ ++ExpectedElt;
+ if (ExpectedElt == NumElts)
+ ExpectedElt = 0;
+
+ if (M[i] < 0) continue; // ignore UNDEF indices
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
+ return false;
+ }
+
+ return true;
+}
+
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
bool &ReverseVEXT, unsigned &Imm) {
}
// Scan through the operands to see if only one value is used.
+  //
+  // As an optimisation, even if more than one value is used it may be more
+  // profitable to splat with one value and then change some lanes.
+  //
+  // Heuristically we decide to do this if the vector has a "dominant" value,
+  // defined as splatted to more than half of the lanes.
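+  //
+  // For example, <a, a, b, a> has the dominant value 'a': we VDUP 'a' into
+  // every lane and then rewrite lane 2 with a single INSERT_VECTOR_ELT,
+  // instead of assembling all four lanes from scratch.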
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
+ bool hasDominantValue = false;
bool isConstant = true;
+
+ // Map of the number of times a particular SDValue appears in the
+ // element list.
+ DenseMap<SDValue, unsigned> ValueCounts;
SDValue Value;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
- if (!Value.getNode())
+ ValueCounts.insert(std::make_pair(V, 0));
+ unsigned &Count = ValueCounts[V];
+
+ // Is this value dominant? (takes up more than half of the lanes)
+ if (++Count > (NumElts / 2)) {
+ hasDominantValue = true;
Value = V;
- else if (V != Value)
- usesOnlyOneValue = false;
+ }
}
+ if (ValueCounts.size() != 1)
+ usesOnlyOneValue = false;
+ if (!Value.getNode() && ValueCounts.size() > 0)
+ Value = ValueCounts.begin()->first;
- if (!Value.getNode())
+ if (ValueCounts.size() == 0)
return DAG.getUNDEF(VT);
if (isOnlyLowElement)
// Use VDUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
- if (usesOnlyOneValue && EltSize <= 32) {
- if (!isConstant)
- return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+ if (hasDominantValue && EltSize <= 32) {
+ if (!isConstant) {
+ SDValue N;
+
+ // If we are VDUPing a value that comes directly from a vector, that will
+ // cause an unnecessary move to and from a GPR, where instead we could
+ // just use VDUPLANE.
+ if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ // We need to create a new undef vector to use for the VDUPLANE if the
+ // size of the vector from which we get the value is different than the
+ // size of the vector that we need to create. We will insert the element
+ // such that the register coalescer will remove unnecessary copies.
+ if (VT != Value->getOperand(0).getValueType()) {
+ ConstantSDNode *constIndex;
+ constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
+ assert(constIndex && "The index is not a constant!");
+ unsigned index = constIndex->getAPIntValue().getLimitedValue() %
+ VT.getVectorNumElements();
+ N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
+ Value, DAG.getConstant(index, MVT::i32)),
+ DAG.getConstant(index, MVT::i32));
+ } else {
+ N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ Value->getOperand(0), Value->getOperand(1));
+ }
+    } else
+ N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+ if (!usesOnlyOneValue) {
+ // The dominant value was splatted as 'N', but we now have to insert
+ // all differing elements.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (Op.getOperand(I) == Value)
+ continue;
+ SmallVector<SDValue, 3> Ops;
+ Ops.push_back(N);
+ Ops.push_back(Op.getOperand(I));
+ Ops.push_back(DAG.getConstant(I, MVT::i32));
+ N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
+ }
+ }
+ return N;
+ }
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumElts; ++i)
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
- SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
- if (Val.getNode())
- return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+ if (usesOnlyOneValue) {
+ SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+ if (isConstant && Val.getNode())
+ return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+ }
}
// If all elements are constants and the case above didn't get hit, fall back
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+ if (V2->getOpcode() == ISD::UNDEF &&
+ isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+
// Check for Neon shuffles that modify both input vectors in place.
// If both results are used, i.e., if there are two shuffles with the same
// source operands and with masks corresponding to both results of one of
return false;
}
-/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending
-/// load, or BUILD_VECTOR with extended elements, return the unextended value.
-static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
+/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
+/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
+/// We insert the required extension here to get the vector to fill a D
+/// register.
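+///
+/// For instance, a v4i8 operand feeding a VMULL is re-extended only to v4i16
+/// (64 bits) so it fits in a D register; the VMULL itself then performs the
+/// final widening to v4i32.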
+static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+ const EVT &OrigTy,
+ const EVT &ExtTy,
+ unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128 bits total. If the OrigTy is less than
+  // 64 bits, we need to insert a new extension so that it will be 64 bits.
+ assert(ExtTy.is128BitVector() && "Unexpected extension size");
+ if (OrigTy.getSizeInBits() >= 64)
+ return N;
+
+ // Must extend size to at least 64 bits to be used as an operand for VMULL.
+ MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy;
+ EVT NewVT;
+ switch (OrigSimpleTy) {
+ default: llvm_unreachable("Unexpected Orig Vector Type");
+ case MVT::v2i8:
+ case MVT::v2i16:
+ NewVT = MVT::v2i32;
+ break;
+ case MVT::v4i8:
+ NewVT = MVT::v4i16;
+ break;
+ }
+ return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N);
+}
+
+/// SkipLoadExtensionForVMULL - Return a load of the original vector size that
+/// does not do any sign/zero extension. If the original vector is less
+/// than 64 bits, an appropriate extension will be added after the load to
+/// reach a total size of 64 bits. We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
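+///
+/// E.g. a sextload of v4i16 feeding a VMULL becomes a plain v4i16 load here;
+/// the sign extension is then absorbed by the VMULL itself rather than being
+/// emitted as a separate vmovl.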
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+ SDValue NonExtendingLoad =
+ DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
+ LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+ LD->isNonTemporal(), LD->isInvariant(),
+ LD->getAlignment());
+ unsigned ExtOp = 0;
+ switch (LD->getExtensionType()) {
+ default: llvm_unreachable("Unexpected LoadExtType");
+ case ISD::EXTLOAD:
+ case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break;
+ case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break;
+ }
+ MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy;
+ MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy;
+ return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG,
+ MemType, ExtType, ExtOp);
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits, we add an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
- return N->getOperand(0);
+ return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+ N->getOperand(0)->getValueType(0),
+ N->getValueType(0),
+ N->getOpcode());
+
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
- return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
- LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
- LD->isNonTemporal(), LD->isInvariant(),
- LD->getAlignment());
+ return SkipLoadExtensionForVMULL(LD, DAG);
+
// Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
// have been legalized as a BITCAST from v4i32.
if (N->getOpcode() == ISD::BITCAST) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
- assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
+ assert(VT.is128BitVector() && VT.isInteger() &&
+ "unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
unsigned NewOpc = 0;
// Legalize to a VMULL instruction.
DebugLoc DL = Op.getDebugLoc();
SDValue Op0;
- SDValue Op1 = SkipExtension(N1, DAG);
+ SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
if (!isMLA) {
- Op0 = SkipExtension(N0, DAG);
+ Op0 = SkipExtensionForVMULL(N0, DAG);
assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
// vaddl q0, d4, d5
// vmovl q1, d6
// vmul q0, q0, q1
- SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
- SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
+ SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
+ SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
EVT Op1VT = Op1.getValueType();
return DAG.getNode(N0->getOpcode(), DL, VT,
DAG.getNode(NewOpc, DL, VT,
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+ case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::ATOMIC_CMP_SWAP:
ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
return;
+ case ISD::ATOMIC_LOAD_MIN:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_UMIN:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_MAX:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_UMAX:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+ return;
}
if (Res.getNode())
Results.push_back(Res);
// ldrex dest, ptr
// (sign extend dest, if required)
// cmp dest, incr
- // cmov.cond scratch2, dest, incr
+ // cmov.cond scratch2, incr, dest
// strex scratch, scratch2, ptr
// cmp scratch, #0
// bne- loopMBB
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(oldval).addReg(incr));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
- .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
+ .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);
MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
if (strOpc == ARM::t2STREX)
MachineBasicBlock *
ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
unsigned Op1, unsigned Op2,
- bool NeedsCarry, bool IsCmpxchg) const {
+ bool NeedsCarry, bool IsCmpxchg,
+ bool IsMinMax, ARMCC::CondCodes CC) const {
// This also handles ATOMIC_SWAP, indicated by Op1==0.
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *contBB = 0, *cont2BB = 0;
- if (IsCmpxchg) {
+ if (IsCmpxchg || IsMinMax)
contBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ if (IsCmpxchg)
cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
- }
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
MF->insert(It, loopMBB);
- if (IsCmpxchg) {
- MF->insert(It, contBB);
- MF->insert(It, cont2BB);
- }
+ if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
+ if (IsCmpxchg) MF->insert(It, cont2BB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
// for ldrexd must be different.
BB = loopMBB;
// Load
+ unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ unsigned GPRPair1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ unsigned GPRPair2;
+ if (IsMinMax) {
+    // We need an extra register pair for doing min/max.
+ unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ GPRPair2 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
+ .addReg(undef)
+ .addReg(vallo)
+ .addImm(ARM::gsub_0);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair2)
+ .addReg(r1)
+ .addReg(valhi)
+ .addImm(ARM::gsub_1);
+ }
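+  // Sketch of the loop being built for ATOMMIN6432 (CC == LE; the other
+  // min/max pseudos differ only in the condition):
+  //   loop: ldrexd r2, r3, [ptr]
+  //         subs   rl, r2, vallo
+  //         sbcs   rh, r3, valhi
+  //         ble    exit          @ current value already <= val: keep it
+  //         strexd rs, vallo, valhi, [ptr]
+  //         cmp    rs, #0
+  //         bne    loop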
+
AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
- .addReg(ARM::R2, RegState::Define)
- .addReg(ARM::R3, RegState::Define).addReg(ptr));
+ .addReg(GPRPair0, RegState::Define).addReg(ptr));
-  // Copy r2/r3 into dest. (This copy will normally be coalesced.)
+  // Copy the loaded register pair into dest. (This copy will normally be
+  // coalesced.)
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2);
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3);
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
+ .addReg(GPRPair0, 0, ARM::gsub_0);
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
+ .addReg(GPRPair0, 0, ARM::gsub_1);
if (IsCmpxchg) {
// Add early exit
-    // Copy to physregs for strexd
+    // Assemble the value to store into a GPRPair for strexd.
unsigned setlo = MI->getOperand(5).getReg();
unsigned sethi = MI->getOperand(6).getReg();
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo);
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi);
+ unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
+ .addReg(undef)
+ .addReg(setlo)
+ .addImm(ARM::gsub_0);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1)
+ .addReg(r1)
+ .addReg(sethi)
+ .addImm(ARM::gsub_1);
} else if (Op1) {
// Perform binary operation
- AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0)
+ unsigned tmpRegLo = MRI.createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo)
.addReg(destlo).addReg(vallo))
.addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
- AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1)
- .addReg(desthi).addReg(valhi)).addReg(0);
+ unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
+ .addReg(desthi).addReg(valhi))
+ .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));
+
+ unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
+ unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
+ .addReg(UndefPair)
+ .addReg(tmpRegLo)
+ .addImm(ARM::gsub_0);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1)
+ .addReg(r1)
+ .addReg(tmpRegHi)
+ .addImm(ARM::gsub_1);
} else {
-    // Copy to physregs for strexd
+    // Assemble the value to store into a GPRPair for strexd.
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo);
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi);
+ unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
+ .addReg(UndefPair)
+ .addReg(vallo)
+ .addImm(ARM::gsub_0);
+ BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1)
+ .addReg(r1)
+ .addReg(valhi)
+ .addImm(ARM::gsub_1);
+ }
+ unsigned GPRPairStore = GPRPair1;
+ if (IsMinMax) {
+ // Compare and branch to exit block.
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
+ BB->addSuccessor(exitMBB);
+ BB->addSuccessor(contBB);
+ BB = contBB;
+ GPRPairStore = GPRPair2;
}
// Store
AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
- .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr));
+ .addReg(GPRPairStore).addReg(ptr));
// Cmp+jump
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(storesuccess).addImm(0));
MachineMemOperand::MOLoad |
MachineMemOperand::MOVolatile, 4, 4);
- if (AFI->isThumb1OnlyFunction())
- BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup));
- else if (!Subtarget->hasVFP2())
- BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp));
- else
- BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+ const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+ const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+ // Add a register mask with no preserved registers. This results in all
+ // registers being marked as clobbered.
+ MIB.addRegMask(RI.getNoPreservedMask());
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getTargetData()->getTypeAllocSize(C->getType());
+ Align = getDataLayout()->getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getTargetData()->getTypeAllocSize(C->getType());
+ Align = getDataLayout()->getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
}
// N.B. the order the invoke BBs are processed in doesn't matter here.
- const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
- const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
SmallVector<MachineBasicBlock*, 64> MBBLPads;
for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
UnitSize = 2;
} else {
// Check whether we can use NEON instructions.
- if (!MF->getFunction()->hasFnAttr(Attribute::NoImplicitFloat) &&
+ if (!MF->getFunction()->getFnAttributes().
+ hasAttribute(Attributes::NoImplicitFloat) &&
Subtarget->hasNEON()) {
if ((Align % 16 == 0) && SizeVal >= 16) {
ldrOpc = ARM::VLD1q32wb_fixed;
} else {
AddDefaultPred(BuildMI(*BB, MI, dl,
TII->get(ldrOpc),scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+ .addReg(srcOut, RegState::Define).addReg(srcIn)
+ .addReg(0).addImm(1));
AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
.addReg(scratch).addReg(destIn)
const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getTargetData()->getTypeAllocSize(C->getType());
+ Align = getDataLayout()->getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ false, /*IsCmpxchg*/true);
+ case ARM::ATOMMIN6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::LE);
+ case ARM::ATOMMAX6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::GE);
+ case ARM::ATOMUMIN6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::LS);
+ case ARM::ATOMUMAX6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+ /*IsMinMax*/ true, ARMCC::HS);
case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
}
-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
- if (!Subtarget->allowsUnalignedMem())
- return false;
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                      bool *Fast) const {
+  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
+ bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
switch (VT.getSimpleVT().SimpleTy) {
default:
return false;
case MVT::i8:
case MVT::i16:
- case MVT::i32:
- return true;
+ case MVT::i32: {
+    // Unaligned access can use (for example) LDRB, LDRH, LDR
+ if (AllowsUnaligned) {
+ if (Fast)
+ *Fast = Subtarget->hasV7Ops();
+ return true;
+ }
+ return false;
+ }
case MVT::f64:
- return Subtarget->hasNEON();
- // FIXME: VLD1 etc with standard alignment is legal.
+ case MVT::v2f64: {
+  // For any little-endian targets with NEON, we can support unaligned ld/st
+  // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+  // A big-endian target may also explicitly support unaligned accesses.
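+  // For example, an unaligned 16-byte access on a little-endian core with
+  // NEON can be done with vld1.8/vst1.8 on a D-register pair even under
+  // strict alignment (SCTLR.A == 1), since element-sized vld1/vst1 accesses
+  // are exempt from alignment checking.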
+ if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
+ return false;
+ }
}
}
EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
- bool IsZeroVal,
+ bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
// See if we can use NEON instructions for this...
- if (IsZeroVal &&
- !F->hasFnAttr(Attribute::NoImplicitFloat) &&
- Subtarget->hasNEON()) {
- if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
- return MVT::v4i32;
- } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) {
- return MVT::v2i32;
+ if ((!IsMemset || ZeroMemset) &&
+ Subtarget->hasNEON() &&
+ !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
+ bool Fast;
+ if (Size >= 16 &&
+ (memOpAlign(SrcAlign, DstAlign, 16) ||
+ (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) {
+ return MVT::v2f64;
+ } else if (Size >= 8 &&
+ (memOpAlign(SrcAlign, DstAlign, 8) ||
+ (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) {
+ return MVT::f64;
}
}
// Lowering to i32/i16 if the size permits.
- if (Size >= 4) {
+ if (Size >= 4)
return MVT::i32;
- } else if (Size >= 2) {
+ else if (Size >= 2)
return MVT::i16;
- }
// Let the target-independent logic figure it out.
return MVT::Other;
}
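// For example, with these rules an aligned 16-byte zero-memset picks v2f64
// (a single vst1), while a 6-byte copy degrades to an i32 store followed by
// an i16 store.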
+bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ EVT VT1 = Val.getValueType();
+ if (!VT1.isSimple() || !VT1.isInteger() ||
+ !VT2.isSimple() || !VT2.isInteger())
+ return false;
+
+ switch (VT1.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
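+    // E.g. for (zext (load i8)), LDRB already zeroes bits 31:8, so the
+    // extension costs no extra instruction.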
+ return true;
+ }
+
+ return false;
+}
+
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
if (V < 0)
return false;
case Intrinsic::arm_neon_vld4lane: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8;
+ uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
+ NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
return false;
}
+
+unsigned
+ARMScalarTargetTransformImpl::getIntImmCost(const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
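+
+  // Rough examples in ARM mode: 0xFF0000 is a valid rotated immediate
+  // (cost 1), while 0x12345678 needs a movw/movt pair on v6t2 (cost 2) or a
+  // constant-pool load on older cores (cost 3).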
+
+ unsigned Bits = Ty->getPrimitiveSizeInBits();
+ if (Bits == 0 || Bits > 32)
+ return 4;
+
+ int32_t SImmVal = Imm.getSExtValue();
+ uint32_t ZImmVal = Imm.getZExtValue();
+ if (!Subtarget->isThumb()) {
+ if ((SImmVal >= 0 && SImmVal < 65536) ||
+ (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
+ (ARM_AM::getSOImmVal(~ZImmVal) != -1))
+ return 1;
+ return Subtarget->hasV6T2Ops() ? 2 : 3;
+ } else if (Subtarget->isThumb2()) {
+ if ((SImmVal >= 0 && SImmVal < 65536) ||
+ (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
+ (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
+ return 1;
+ return Subtarget->hasV6T2Ops() ? 2 : 3;
+ } else /*Thumb1*/ {
+ if (SImmVal >= 0 && SImmVal < 256)
+ return 1;
+ if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
+ return 2;
+ // Load from constantpool.
+ return 3;
+ }
+ return 2;
+}