Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and ARM

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index 853716b8f6219c81aa40544f10910e1e582b9e53..563b37e6cba8747459498bc8c6cad78b4f763f55 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -393,6 +393,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
    }
  
+  if (HasDivModLibcall) {
+    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  }
+
    if (Subtarget->isThumb1Only())
      addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
    else
@@ -461,6 +466,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
      setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
      setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
+    // a destination type that is wider than the source.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
  
      setTargetDAGCombine(ISD::INTRINSIC_VOID);
      setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
@@ -778,7 +787,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
    case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
    case ARMISD::CMOV:          return "ARMISD::CMOV";
-  case ARMISD::CNEG:          return "ARMISD::CNEG";
  
    case ARMISD::RBIT:          return "ARMISD::RBIT";
  
@@ -853,6 +861,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::VZIP:          return "ARMISD::VZIP";
    case ARMISD::VUZP:          return "ARMISD::VUZP";
    case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
+  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
    case ARMISD::VMULLs:        return "ARMISD::VMULLs";
    case ARMISD::VMULLu:        return "ARMISD::VMULLu";
    case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
@@ -861,6 +871,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::BFI:           return "ARMISD::BFI";
    case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
    case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
+  case ARMISD::VBSL:          return "ARMISD::VBSL";
    case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
    case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
    case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
@@ -946,27 +957,6 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
    return Sched::RegPressure;
  }
  
-// FIXME: Move to RegInfo
-unsigned
-ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
-                                       MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
-  switch (RC->getID()) {
-  default:
-    return 0;
-  case ARM::tGPRRegClassID:
-    return TFI->hasFP(MF) ? 4 : 5;
-  case ARM::GPRRegClassID: {
-    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
-    return 10 - FP - (Subtarget->isR9Reserved() ? 1 : 0);
-  }
-  case ARM::SPRRegClassID:  // Currently not used as 'rep' register class.
-  case ARM::DPRRegClassID:
-    return 32 - 10;
-  }
-}
-
  //===----------------------------------------------------------------------===//
  // Lowering Code
  //===----------------------------------------------------------------------===//
@@ -1825,6 +1815,16 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const {
    return HasRet;
  }
  
+bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!EnableARMTailCalls)
+    return false;
+
+  if (!CI->isTailCall())
+    return false;
+
+  return !Subtarget->isThumb1Only();
+}
+
  // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
  // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
  // one of the above mentioned nodes. It has to be wrapped because otherwise
@@ -2108,7 +2108,7 @@ ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG)
    const {
    DebugLoc dl = Op.getDebugLoc();
    return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
-                     Op.getOperand(0), Op.getOperand(1));
+                     Op.getOperand(0));
  }
  
  SDValue
@@ -2163,6 +2163,13 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
      }
      return Result;
    }
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
+      ? ARMISD::VMULLs : ARMISD::VMULLu;
+    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
    }
  }
  
@@ -2378,7 +2385,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
        assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
  
        int index = ArgLocs[i].getValNo();
-      
+
        // Some Ins[] entries become multiple ArgLoc[] entries.
        // Process them only once.
        if (index != lastInsIndex)
@@ -2389,8 +2396,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
            // In case of tail call optimization mark all arguments mutable. Since they
            // could be overwritten by lowering of arguments in case of a tail call.
            if (Flags.isByVal()) {
-            int FI = MFI->CreateFixedObject(Flags.getByValSize(),
-                                            VA.getLocMemOffset(), false);
+            unsigned Bytes = Flags.getByValSize();
+            if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+            int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), false);
              InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
            } else {
              int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
@@ -2549,6 +2557,27 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
    return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
  }
  
+/// duplicateCmp - Glue values can have only one use, so this function
+/// duplicates a comparison node.
+SDValue
+ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
+  unsigned Opc = Cmp.getOpcode();
+  DebugLoc DL = Cmp.getDebugLoc();
+  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
+    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+
+  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
+  Cmp = Cmp.getOperand(0);
+  Opc = Cmp.getOpcode();
+  if (Opc == ARMISD::CMPFP)
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+  else {
+    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
+  }
+  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
+}
+
  SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    SDValue Cond = Op.getOperand(0);
    SDValue SelectTrue = Op.getOperand(1);
@@ -2584,7 +2613,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
          EVT VT = Cond.getValueType();
          SDValue ARMcc = Cond.getOperand(2);
          SDValue CCR = Cond.getOperand(3);
-        SDValue Cmp = Cond.getOperand(4);
+        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
          return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
        }
      }
@@ -2713,8 +2742,8 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
        // If one of the operand is zero, it's safe to ignore the NaN case since
        // we only care about equality comparisons.
        (SeenZero || (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS)))) {
-    // If unsafe fp math optimization is enabled and there are no othter uses of
-    // the CMP operands, and the condition code is EQ oe NE, we can optimize it
+    // If unsafe fp math optimization is enabled and there are no other uses of
+    // the CMP operands, and the condition code is EQ or NE, we can optimize it
      // to an integer comparison.
      if (CC == ISD::SETOEQ)
        CC = ISD::SETEQ;
@@ -2843,8 +2872,39 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
    return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
  }
  
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT OperandVT = Op.getOperand(0).getValueType();
+  assert(OperandVT == MVT::v4i16 && "Invalid type for custom lowering!");
+  if (VT != MVT::v4f32)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  unsigned CastOpc;
+  unsigned Opc;
+  switch (Op.getOpcode()) {
+  default:
+    assert(0 && "Invalid opcode!");
+  case ISD::SINT_TO_FP:
+    CastOpc = ISD::SIGN_EXTEND;
+    Opc = ISD::SINT_TO_FP;
+    break;
+  case ISD::UINT_TO_FP:
+    CastOpc = ISD::ZERO_EXTEND;
+    Opc = ISD::UINT_TO_FP;
+    break;
+  }
+
+  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(Opc, dl, VT, Op);
+}
+
  static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
+
    DebugLoc dl = Op.getDebugLoc();
    unsigned Opc;
  
@@ -2901,7 +2961,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
      AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
      SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                    DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
-                                              
+
      SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                                DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                                DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
@@ -3540,6 +3600,13 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
    return true;
  }
  
+static bool isVTBLMask(const SmallVectorImpl<int> &M, EVT VT) {
+  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
+  // range, then 0 is placed into the resulting vector. So pretty much any mask
+  // of 8 elements can work here.
+  return VT == MVT::v8i8 && M.size() == 8;
+}
+
  static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
                         unsigned &WhichResult) {
    unsigned EltSz = VT.getVectorElementType().getSizeInBits();
@@ -3979,6 +4046,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isVREVMask(M, VT, 32) ||
            isVREVMask(M, VT, 16) ||
            isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+          isVTBLMask(M, VT) ||
            isVTRNMask(M, VT, WhichResult) ||
            isVUZPMask(M, VT, WhichResult) ||
            isVZIPMask(M, VT, WhichResult) ||
@@ -4056,6 +4124,29 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
    }
  }
  
+static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
+                                       SmallVectorImpl<int> &ShuffleMask,
+                                       SelectionDAG &DAG) {
+  // Check to see if we can use the VTBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  DebugLoc DL = Op.getDebugLoc();
+
+  SmallVector<SDValue, 8> VTBLMask;
+  for (SmallVectorImpl<int>::iterator
+         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
+    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
+
+  if (V2.getNode()->getOpcode() == ISD::UNDEF)
+    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
+                                   &VTBLMask[0], 8));
+
+  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
+                                 &VTBLMask[0], 8));
+}
+
  static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
    SDValue V1 = Op.getOperand(0);
    SDValue V2 = Op.getOperand(1);
@@ -4173,6 +4264,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
  
+  if (VT == MVT::v8i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
    return SDValue();
  }
  
@@ -4322,6 +4419,28 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
                       MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
  }
  
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
  static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
    // Multiplications are only custom-lowered for 128-bit vectors so that
    // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
@@ -4330,29 +4449,73 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
    SDNode *N0 = Op.getOperand(0).getNode();
    SDNode *N1 = Op.getOperand(1).getNode();
    unsigned NewOpc = 0;
-  if (isSignExtended(N0, DAG) && isSignExtended(N1, DAG))
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
      NewOpc = ARMISD::VMULLs;
-  else if (isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG))
-    NewOpc = ARMISD::VMULLu;
-  else if (VT == MVT::v2i64)
-    // Fall through to expand this.  It is not legal.
-    return SDValue();
-  else
-    // Other vector multiplications are legal.
-    return Op;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = ARMISD::VMULLu;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLs;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
  
    // Legalize to a VMULL instruction.
    DebugLoc DL = Op.getDebugLoc();
-  SDValue Op0 = SkipExtension(N0, DAG);
+  SDValue Op0;
    SDValue Op1 = SkipExtension(N1, DAG);
-
-  assert(Op0.getValueType().is64BitVector() &&
-         Op1.getValueType().is64BitVector() &&
-         "unexpected types for extended operands to VMULL");
-  return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  if (!isMLA) {
+    Op0 = SkipExtension(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+
+  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back vmul + vmla.
+  //   vmull q0, d4, d6
+  //   vmlal q0, d5, d6
+  // is faster than
+  //   vaddl q0, d4, d5
+  //   vmovl q1, d6
+  //   vmul  q0, q0, q1
+  SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
  }
  
-static SDValue 
+static SDValue
  LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
    // Convert to float
    // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
@@ -4363,7 +4526,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
    Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
    // Get reciprocal estimate.
    // float4 recip = vrecpeq_f32(yf);
-  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                     DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
    // Because char has a smaller range than uchar, we can actually get away
    // without any newton steps.  This requires that we use a weird bias
@@ -4381,7 +4544,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
    return X;
  }
  
-static SDValue 
+static SDValue
  LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
    SDValue N2;
    // Convert to float.
@@ -4391,13 +4554,13 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
    N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
    N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
-  
+
    // Use reciprocal estimate and one refinement step.
    // float4 recip = vrecpeq_f32(yf);
    // recip *= vrecpsq_f32(yf, recip);
-  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                     DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
-  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                     DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
                     N1, N2);
    N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
@@ -4427,15 +4590,15 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    SDValue N2, N3;
-  
+
    if (VT == MVT::v8i8) {
      N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
      N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
-    
+
      N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                       DAG.getIntPtrConstant(4));
      N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
-                     DAG.getIntPtrConstant(4)); 
+                     DAG.getIntPtrConstant(4));
      N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                       DAG.getIntPtrConstant(0));
      N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
@@ -4446,7 +4609,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
  
      N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
      N0 = LowerCONCAT_VECTORS(N0, DAG);
-    
+
      N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
      return N0;
    }
@@ -4462,32 +4625,32 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    SDValue N2, N3;
-  
+
    if (VT == MVT::v8i8) {
      N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
-    
+
      N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                       DAG.getIntPtrConstant(4));
      N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
-                     DAG.getIntPtrConstant(4)); 
+                     DAG.getIntPtrConstant(4));
      N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                       DAG.getIntPtrConstant(0));
      N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                       DAG.getIntPtrConstant(0));
-    
+
      N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
      N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
-    
+
      N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
      N0 = LowerCONCAT_VECTORS(N0, DAG);
-    
-    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 
+
+    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                       DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
                       N0);
      return N0;
    }
-  
+
    // v4i16 sdiv ... Convert to float.
    // float4 yf = vcvt_f32_s32(vmovl_u16(y));
    // float4 xf = vcvt_f32_s32(vmovl_u16(x));
@@ -4500,13 +4663,13 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
    // float4 recip = vrecpeq_f32(yf);
    // recip *= vrecpsq_f32(yf, recip);
    // recip *= vrecpsq_f32(yf, recip);
-  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                     DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
-  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                     DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
                     N1, N2);
    N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
-  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                     DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
                     N1, N2);
    N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
@@ -4535,7 +4698,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::GlobalAddress:
      return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
        LowerGlobalAddressELF(Op, DAG);
-  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
    case ISD::SELECT:        return LowerSELECT(Op, DAG);
    case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
    case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
@@ -4556,7 +4719,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG);
    case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                                 Subtarget);
-  case ISD::BITCAST:   return ExpandBITCAST(Op.getNode(), DAG);
+  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
    case ISD::SHL:
    case ISD::SRL:
    case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -4856,6 +5019,68 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
    case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
  
+  case ARM::ADCSSri:
+  case ARM::ADCSSrr:
+  case ARM::ADCSSrs:
+  case ARM::SBCSSri:
+  case ARM::SBCSSrr:
+  case ARM::SBCSSrs:
+  case ARM::RSBSri:
+  case ARM::RSBSrr:
+  case ARM::RSBSrs:
+  case ARM::RSCSri:
+  case ARM::RSCSrs: {
+    unsigned OldOpc = MI->getOpcode();
+    unsigned Opc = 0;
+    switch (OldOpc) {
+      case ARM::ADCSSrr:
+        Opc = ARM::ADCrr;
+        break;
+      case ARM::ADCSSri:
+        Opc = ARM::ADCri;
+        break;
+      case ARM::ADCSSrs:
+        Opc = ARM::ADCrs;
+        break;
+      case ARM::SBCSSrr:
+        Opc = ARM::SBCrr;
+        break;
+      case ARM::SBCSSri:
+        Opc = ARM::SBCri;
+        break;
+      case ARM::SBCSSrs:
+        Opc = ARM::SBCrs;
+        break;
+      case ARM::RSBSri:
+        Opc = ARM::RSBri;
+        break;
+      case ARM::RSBSrr:
+        Opc = ARM::RSBrr;
+        break;
+      case ARM::RSBSrs:
+        Opc = ARM::RSBrs;
+        break;
+      case ARM::RSCSri:
+        Opc = ARM::RSCri;
+        break;
+      case ARM::RSCSrs:
+        Opc = ARM::RSCrs;
+        break;
+      default:
+        llvm_unreachable("Unknown opcode?");
+    }
+
+    MachineInstrBuilder MIB =
+      BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(Opc));
+    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
+      MIB.addOperand(MI->getOperand(i));
+    AddDefaultPred(MIB);
+    MIB.addReg(ARM::CPSR, RegState::Define); // S bit
+    MI->eraseFromParent();
+    return BB;
+  }
+
+
    case ARM::tMOVCCr_pseudo: {
      // To "insert" a SELECT_CC instruction, we actually have to insert the
      // diamond control-flow pattern.  The incoming instruction knows the
@@ -5066,6 +5291,42 @@ static SDValue PerformSUBCombine(SDNode *N,
    return SDValue();
  }
  
+/// PerformVMULCombine
+/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
+/// special multiplier accumulator forwarding.
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    Opcode = N0.getOpcode();
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
  static SDValue PerformMULCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
@@ -5078,6 +5339,8 @@ static SDValue PerformMULCombine(SDNode *N,
      return SDValue();
  
    EVT VT = N->getValueType(0);
+  if (VT.is64BitVector() || VT.is128BitVector())
+    return PerformVMULCombine(N, DCI, Subtarget);
    if (VT != MVT::i32)
      return SDValue();
  
@@ -5120,12 +5383,16 @@ static SDValue PerformMULCombine(SDNode *N,
  
  static SDValue PerformANDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
+
    // Attempt to use immediate-form VBIC
    BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
    DebugLoc dl = N->getDebugLoc();
    EVT VT = N->getValueType(0);
    SelectionDAG &DAG = DCI.DAG;
  
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+  
    APInt SplatBits, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
@@ -5159,6 +5426,9 @@ static SDValue PerformORCombine(SDNode *N,
    EVT VT = N->getValueType(0);
    SelectionDAG &DAG = DCI.DAG;
  
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+  
    APInt SplatBits, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
@@ -5179,6 +5449,37 @@ static SDValue PerformORCombine(SDNode *N,
      }
    }
  
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    APInt SplatBits0;
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                  HasAnyUndefs) && !HasAnyUndefs) {
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+      APInt SplatBits1;
+      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                    HasAnyUndefs) && !HasAnyUndefs &&
+          SplatBits0 == ~SplatBits1) {
+        // Canonicalize the vector type to make instruction selection simpler.
+        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                     N0->getOperand(1), N0->getOperand(0),
+                                     N1->getOperand(1));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      }
+    }
+  }
+
    // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
    // reasonable.
  
@@ -5186,19 +5487,16 @@ static SDValue PerformORCombine(SDNode *N,
    if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
      return SDValue();
  
-  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
    DebugLoc DL = N->getDebugLoc();
    // 1) or (and A, mask), val => ARMbfi A, val, mask
    //      iff (val & mask) == val
    //
    // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
-  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //          && mask == ~mask2
    //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
-  //          && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+  //          && ~mask == mask2
    //  (i.e., copy a bitfield value into another bitfield of the same width)
-  if (N0.getOpcode() != ISD::AND)
-    return SDValue();
  
    if (VT != MVT::i32)
      return SDValue();
@@ -5241,26 +5539,26 @@ static SDValue PerformORCombine(SDNode *N,
        return SDValue();
      unsigned Mask2 = N11C->getZExtValue();
  
+    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
+    // as is to match.
      if (ARM::isBitFieldInvertedMask(Mask) &&
-        ARM::isBitFieldInvertedMask(~Mask2) &&
-        (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) {
+        (Mask == ~Mask2)) {
        // The pack halfword instruction works better for masks that fit it,
        // so use that when it's available.
        if (Subtarget->hasT2ExtractPack() &&
            (Mask == 0xffff || Mask == 0xffff0000))
          return SDValue();
        // 2a
-      unsigned lsb = CountTrailingZeros_32(Mask2);
+      unsigned amt = CountTrailingZeros_32(Mask2);
        Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
-                        DAG.getConstant(lsb, MVT::i32));
+                        DAG.getConstant(amt, MVT::i32));
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                          DAG.getConstant(Mask, MVT::i32));
        // Do not add new nodes to DAG combiner worklist.
        DCI.CombineTo(N, Res, false);
        return SDValue();
      } else if (ARM::isBitFieldInvertedMask(~Mask) &&
-               ARM::isBitFieldInvertedMask(Mask2) &&
-               (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
+               (~Mask == Mask2)) {
        // The pack halfword instruction works better for masks that fit it,
        // so use that when it's available.
        if (Subtarget->hasT2ExtractPack() &&
@@ -5271,7 +5569,7 @@ static SDValue PerformORCombine(SDNode *N,
        Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                          DAG.getConstant(lsb, MVT::i32));
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
-                                DAG.getConstant(Mask2, MVT::i32));
+                        DAG.getConstant(Mask2, MVT::i32));
        // Do not add new nodes to DAG combiner worklist.
        DCI.CombineTo(N, Res, false);
        return SDValue();
@@ -5326,6 +5624,37 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
    SDValue InDouble = N->getOperand(0);
    if (InDouble.getOpcode() == ARMISD::VMOVDRR)
      return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+  // vmovrrd(load f64) -> (load i32), (load i32)
+  SDNode *InNode = InDouble.getNode();
+  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+      InNode->getValueType(0) == MVT::f64 &&
+      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+      !cast<LoadSDNode>(InNode)->isVolatile()) {
+    // TODO: Should this be done for non-FrameIndex operands?
+    LoadSDNode *LD = cast<LoadSDNode>(InNode);
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc DL = LD->getDebugLoc();
+    SDValue BasePtr = LD->getBasePtr();
+    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
+                                 LD->getPointerInfo(), LD->isVolatile(),
+                                 LD->isNonTemporal(), LD->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
+                                 LD->getPointerInfo(), LD->isVolatile(),
+                                 LD->isNonTemporal(),
+                                 std::min(4U, LD->getAlignment() / 2));
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
+    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
+    DCI.RemoveFromWorklist(LD);
+    DAG.DeleteNode(LD);
+    return Result;
+  }
+
    return SDValue();
  }
  
@@ -5355,8 +5684,28 @@ static SDValue PerformSTORECombine(SDNode *N,
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    StoreSDNode *St = cast<StoreSDNode>(N);
    SDValue StVal = St->getValue();
-  if (!ISD::isNormalStore(St) || St->isVolatile() ||
-      StVal.getValueType() != MVT::i64 ||
+  if (!ISD::isNormalStore(St) || St->isVolatile())
+    return SDValue();
+
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse() && !St->isVolatile()) {
+    SelectionDAG  &DAG = DCI.DAG;
+    DebugLoc DL = St->getDebugLoc();
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
+                                  StVal.getNode()->getOperand(0), BasePtr,
+                                  St->getPointerInfo(), St->isVolatile(),
+                                  St->isNonTemporal(), St->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
+                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(),
+                        std::min(4U, St->getAlignment() / 2));
+  }
+
+  if (StVal.getValueType() != MVT::i64 ||
        StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
  
@@ -5585,7 +5934,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
      EVT VecTy;
      if (isLoad)
        VecTy = N->getValueType(0);
-    else 
+    else
        VecTy = N->getOperand(AddrOpIdx+1).getValueType();
      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
      if (isLaneOp)
@@ -5635,7 +5984,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
      DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
  
      break;
-  } 
+  }
    return SDValue();
  }