For pre-v6t2 targets, only select MOVi32imm if the immediate can be handled with...

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index 4a7866592d65301d98fbd4f1d9616746f5a539d8..ca0bca4445a413edb9c0f7a8d6266a4251c7a986 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -84,8 +84,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
    EVT ElemTy = VT.getVectorElementType();
    if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
      setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
-  if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
    if (ElemTy != MVT::i32) {
      setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
      setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
@@ -104,6 +103,10 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
      setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand);
+    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction(VT.getSimpleVT(),
+                          (MVT::SimpleValueType)InnerVT, Expand);
    }
    setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
  
@@ -460,6 +463,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setTargetDAGCombine(ISD::ANY_EXTEND);
      setTargetDAGCombine(ISD::SELECT_CC);
      setTargetDAGCombine(ISD::BUILD_VECTOR);
+    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    }
  
    computeRegisterProperties();
@@ -549,7 +553,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
    // the default expansion.
    if (Subtarget->hasDataBarrier() ||
-      (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())) {
+      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
      // membarrier needs custom lowering; the rest are legal and handled
      // normally.
      setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
@@ -593,6 +597,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand);
  
+  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
+
    // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
    if (!Subtarget->hasV6Ops()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -667,8 +673,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setTargetDAGCombine(ISD::SUB);
    setTargetDAGCombine(ISD::MUL);
  
-  if (Subtarget->hasV6T2Ops())
+  if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON())
      setTargetDAGCombine(ISD::OR);
+  if (Subtarget->hasNEON())
+    setTargetDAGCombine(ISD::AND);
  
    setStackPointerRegisterToSaveRestore(ARM::SP);
  
@@ -765,7 +773,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
  
    case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
-  case ARMISD::SYNCBARRIER:   return "ARMISD::SYNCBARRIER";
+  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
+
+  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
  
    case ARMISD::VCEQ:          return "ARMISD::VCEQ";
    case ARMISD::VCGE:          return "ARMISD::VCGE";
@@ -812,6 +822,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::FMAX:          return "ARMISD::FMAX";
    case ARMISD::FMIN:          return "ARMISD::FMIN";
    case ARMISD::BFI:           return "ARMISD::BFI";
+  case ARMISD::VORRIMM:        return "ARMISD::VORRIMM";
    }
  }
  
@@ -854,6 +865,8 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  
    for (unsigned i = 0; i != NumVals; ++i) {
      EVT VT = N->getValueType(i);
+    if (VT == MVT::Flag || VT == MVT::Other)
+      continue;
      if (VT.isFloatingPoint() || VT.isVector())
        return Sched::Latency;
    }
@@ -865,11 +878,13 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
    // is not available.
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
-  if (TID.mayLoad())
-    return Sched::Latency;
  
-  if (!Itins->isEmpty() && Itins->getStageLatency(TID.getSchedClass()) > 2)
+  if (TID.getNumDefs() == 0)
+    return Sched::RegPressure;
+  if (!Itins->isEmpty() &&
+      Itins->getOperandCycle(TID.getSchedClass(), 0) > 2)
      return Sched::Latency;
+
    return Sched::RegPressure;
  }
  
@@ -1766,7 +1781,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
    unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
    ARMConstantPoolValue *CPV =
      new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
-                             ARMCP::CPValue, PCAdj, "tlsgd", true);
+                             ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
    SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
    Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
@@ -1813,7 +1828,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
      unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
      ARMConstantPoolValue *CPV =
        new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
-                               ARMCP::CPValue, PCAdj, "gottpoff", true);
+                               ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true);
      Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
      Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
      Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
@@ -1829,7 +1844,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                           false, false, 0);
    } else {
      // local exec model
-    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff");
+    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMCP::TPOFF);
      Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
      Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
      Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
@@ -1865,7 +1880,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
    if (RelocM == Reloc::PIC_) {
      bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
      ARMConstantPoolValue *CPV =
-      new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT");
+      new ARMConstantPoolValue(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
@@ -2021,21 +2036,55 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
  static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
                                 const ARMSubtarget *Subtarget) {
    DebugLoc dl = Op.getDebugLoc();
-  SDValue Op5 = Op.getOperand(5);
-  unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue();
-  // Some subtargets which have dmb and dsb instructions can handle barriers
-  // directly. Some ARMv6 cpus can support them with the help of mcr
-  // instruction. Thumb1 and pre-v6 ARM mode use a libcall instead and should
-  // never get here.
-  unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER;
-  if (Subtarget->hasDataBarrier())
-    return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0));
-  else {
-    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb1Only() &&
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
             "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
-    return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0),
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                         DAG.getConstant(0, MVT::i32));
    }
+
+  SDValue Op5 = Op.getOperand(5);
+  bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0;
+  unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0);
+
+  ARM_MB::MemBOpt DMBOpt;
+  if (isDeviceBarrier)
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY;
+  else
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH;
+  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(DMBOpt, MVT::i32));
+}
+
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  // ARM pre v5TE and Thumb1 does not have preload instructions.
+  if (!(Subtarget->isThumb2() ||
+        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
+  if (!isRead &&
+      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
+    // ARMv7 with MP extension has PLDW.
+    return Op.getOperand(0);
+
+  if (Subtarget->isThumb())
+    // Invert the bits.
+    isRead = ~isRead & 1;
+  unsigned isData = Subtarget->isThumb() ? 0 : 1;
+
+  // Currently there is no intrinsic that matches pli.
+  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
+                     DAG.getConstant(isData, MVT::i32));
  }
  
  static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
@@ -3025,7 +3074,38 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
    if (Swap)
      std::swap(Op0, Op1);
  
-  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+  // If one of the operands is a constant vector zero, attempt to fold the
+  // comparison to a specialized compare-against-zero form.
+  SDValue SingleOp;
+  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+    SingleOp = Op0;
+  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+    if (Opc == ARMISD::VCGE)
+      Opc = ARMISD::VCLEZ;
+    else if (Opc == ARMISD::VCGT)
+      Opc = ARMISD::VCLTZ;
+    SingleOp = Op1;
+  }
+  
+  SDValue Result;
+  if (SingleOp.getNode()) {
+    switch (Opc) {
+    case ARMISD::VCEQ:
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGE:
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLEZ:
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGT:
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLTZ:
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
+    default:
+      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+    }
+  } else {
+     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+  }
  
    if (Invert)
      Result = DAG.getNOT(dl, Result, VT);
@@ -3038,7 +3118,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  /// operand (e.g., VMOV).  If so, return the encoded value.
  static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                   unsigned SplatBitSize, SelectionDAG &DAG,
-                                 EVT &VT, bool is128Bits, bool isVMOV) {
+                                 EVT &VT, bool is128Bits, NEONModImmType type) {
    unsigned OpCmode, Imm;
  
    // SplatBitSize is set to the smallest size that splats the vector, so a
@@ -3051,7 +3131,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
  
    switch (SplatBitSize) {
    case 8:
-    if (!isVMOV)
+    if (type != VMOVModImm)
        return SDValue();
      // Any 1-byte value is OK.  Op=0, Cmode=1110.
      assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
@@ -3108,6 +3188,9 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
        break;
      }
  
+    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+    if (type == OtherModImm) return SDValue();
+
      if ((SplatBits & ~0xffff) == 0 &&
          ((SplatBits | SplatUndef) & 0xff) == 0xff) {
        // Value = 0x0000nnff: Op=x, Cmode=1100.
@@ -3134,7 +3217,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
      return SDValue();
  
    case 64: {
-    if (!isVMOV)
+    if (type != VMOVModImm)
        return SDValue();
      // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
      uint64_t BitMask = 0xff;
@@ -3403,7 +3486,8 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
        EVT VmovVT;
        SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                        SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VmovVT, VT.is128BitVector(), true);
+                                      DAG, VmovVT, VT.is128BitVector(),
+                                      VMOVModImm);
        if (Val.getNode()) {
          SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
          return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
@@ -3414,7 +3498,8 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               ((1LL << SplatBitSize) - 1));
        Val = isNEONModifiedImm(NegatedImm,
                                        SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, VmovVT, VT.is128BitVector(), false);
+                                      DAG, VmovVT, VT.is128BitVector(), 
+                                      VMVNModImm);
        if (Val.getNode()) {
          SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
          return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vmov);
@@ -3461,8 +3546,8 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
        for (unsigned i = 0; i < NumElts; ++i)
          Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
                                    Op.getOperand(i)));
-      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &Ops[0],
-                                NumElts);
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
        Val = LowerBUILD_VECTOR(Val, DAG, ST);
        if (Val.getNode())
          return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val);
@@ -3731,14 +3816,19 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  }
  
  static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
-  SDValue Vec = Op.getOperand(0);
+  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
    SDValue Lane = Op.getOperand(1);
-  assert(VT == MVT::i32 &&
-         Vec.getValueType().getVectorElementType().getSizeInBits() < 32 &&
-         "unexpected type for custom-lowering vector extract");
-  return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  SDValue Vec = Op.getOperand(0);
+  if (Op.getValueType() == MVT::i32 &&
+      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
+    DebugLoc dl = Op.getDebugLoc();
+    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+  }
+
+  return Op;
  }
  
  static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
@@ -3786,7 +3876,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
    } else if ((N0->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N0)) &&
               (N1->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N1))) {
      NewOpc = ARMISD::VMULLu;
-  } else if (VT.getSimpleVT().SimpleTy == MVT::v2i64) {
+  } else if (VT == MVT::v2i64) {
      // Fall through to expand this.  It is not legal.
      return SDValue();
    } else {
@@ -3820,6 +3910,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
    case ISD::VASTART:       return LowerVASTART(Op, DAG);
    case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
+  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
    case ISD::FP_TO_SINT:
@@ -4390,10 +4481,67 @@ static SDValue PerformMULCombine(SDNode *N,
    return SDValue();
  }
  
+static SDValue PerformANDCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  // Attempt to use immediate-form VBIC
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VbicVT;
+      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VbicVT, VT.is128BitVector(), 
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BIT_CONVERT, dl, VbicVT, N->getOperand(0));
+        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vbic);
+      }
+    }
+  }
+  
+  return SDValue();
+}
+
  /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
  static SDValue PerformORCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
+  // Attempt to use immediate-form VORR
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && Subtarget->hasNEON() &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VorrVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VorrVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BIT_CONVERT, dl, VorrVT, N->getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vorr);
+      }
+    }
+  }
+
    // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
    // reasonable.
  
@@ -4401,7 +4549,6 @@ static SDValue PerformORCombine(SDNode *N,
    if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
      return SDValue();
  
-  SelectionDAG &DAG = DCI.DAG;
    SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
    DebugLoc DL = N->getDebugLoc();
    // 1) or (and A, mask), val => ARMbfi A, val, mask
@@ -4416,7 +4563,6 @@ static SDValue PerformORCombine(SDNode *N,
    if (N0.getOpcode() != ISD::AND)
      return SDValue();
  
-  EVT VT = N->getValueType(0);
    if (VT != MVT::i32)
      return SDValue();
  
@@ -4485,7 +4631,7 @@ static SDValue PerformORCombine(SDNode *N,
        DCI.CombineTo(N, Res, false);
      }
    }
-
+  
    return SDValue();
  }
  
@@ -4531,6 +4677,59 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG) {
    return SDValue();
  }
  
+/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
+/// ISD::VECTOR_SHUFFLE.
+static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
+  // The LLVM shufflevector instruction does not require the shuffle mask
+  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
+  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
+  // operands do not match the mask length, they are extended by concatenating
+  // them with undef vectors.  That is probably the right thing for other
+  // targets, but for NEON it is better to concatenate two double-register
+  // size vector operands into a single quad-register size vector.  Do that
+  // transformation here:
+  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
+  //   shuffle(concat(v1, v2), undef)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op0.getNumOperands() != 2 ||
+      Op1.getNumOperands() != 2)
+    return SDValue();
+  SDValue Concat0Op1 = Op0.getOperand(1);
+  SDValue Concat1Op1 = Op1.getOperand(1);
+  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
+      Concat1Op1.getOpcode() != ISD::UNDEF)
+    return SDValue();
+  // Skip the transformation if any of the types are illegal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = N->getValueType(0);
+  if (!TLI.isTypeLegal(VT) ||
+      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
+      !TLI.isTypeLegal(Concat1Op1.getValueType()))
+    return SDValue();
+
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+                                  Op0.getOperand(0), Op1.getOperand(0));
+  // Translate the shuffle mask.
+  SmallVector<int, 16> NewMask;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfElts = NumElts/2;
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+  for (unsigned n = 0; n < NumElts; ++n) {
+    int MaskElt = SVN->getMaskElt(n);
+    int NewElt = -1;
+    if (MaskElt < (int)HalfElts)
+      NewElt = MaskElt;
+    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
+      NewElt = HalfElts + MaskElt - NumElts;
+    NewMask.push_back(NewElt);
+  }
+  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
+                              DAG.getUNDEF(VT), NewMask.data());
+}
+
  /// PerformVDUPLANECombine - Target-specific dag combine xforms for
  /// ARMISD::VDUPLANE.
  static SDValue PerformVDUPLANECombine(SDNode *N, SelectionDAG &DAG) {
@@ -4823,7 +5022,8 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
  
      if (VT == MVT::i32 &&
          (EltVT == MVT::i8 || EltVT == MVT::i16) &&
-        TLI.isTypeLegal(Vec.getValueType())) {
+        TLI.isTypeLegal(Vec.getValueType()) &&
+        isa<ConstantSDNode>(Lane)) {
  
        unsigned Opc = 0;
        switch (N->getOpcode()) {
@@ -4936,9 +5136,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::SUB:        return PerformSUBCombine(N, DCI);
    case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
    case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
+  case ISD::AND:        return PerformANDCombine(N, DCI);
    case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
    case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
    case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI.DAG);
+  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
    case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI.DAG);
    case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
    case ISD::SHL:
@@ -5386,6 +5588,40 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
    return TargetLowering::getConstraintType(Constraint);
  }
  
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ARMTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  const Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'l':
+    if (type->isIntegerTy()) {
+      if (Subtarget->isThumb())
+        weight = CW_SpecificReg;
+      else
+        weight = CW_Register;
+    }
+    break;
+  case 'w':
+    if (type->isFloatingPointTy())
+      weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
  std::pair<unsigned, const TargetRegisterClass*>
  ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                  EVT VT) const {