sign_extend_inreg needs to be expanded for pre-v6 Thumb as well as ARM.

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index c8a8ac88c72a60af2db49b259c4118268804200a..7f89b2d1685697782a411b0186b9266316831983 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -12,6 +12,7 @@
  //
  //===----------------------------------------------------------------------===//
  
+#define DEBUG_TYPE "arm-isel"
  #include "ARM.h"
  #include "ARMAddressingModes.h"
  #include "ARMConstantPoolValue.h"
@@ -40,6 +41,7 @@
  #include "llvm/MC/MCSectionMachO.h"
  #include "llvm/Target/TargetOptions.h"
  #include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/Statistic.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
@@ -47,11 +49,24 @@
  #include <sstream>
  using namespace llvm;
  
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+// This option should go away when tail calls fully work.
+static cl::opt<bool>
+EnableARMTailCalls("arm-tail-calls", cl::Hidden,
+  cl::desc("Generate tail calls (TEMPORARY OPTION)."),
+  cl::init(true));
+
  static cl::opt<bool>
  EnableARMLongCalls("arm-long-calls", cl::Hidden,
    cl::desc("Generate calls via indirect call instructions."),
    cl::init(false));
  
+static cl::opt<bool>
+ARMInterworking("arm-interworking", cl::Hidden,
+  cl::desc("Enable / disable ARM interworking (for debugging only)"),
+  cl::init(true));
+
  static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
                                     CCValAssign::LocInfo &LocInfo,
                                     ISD::ArgFlagsTy &ArgFlags,
@@ -94,10 +109,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
    }
    setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
-  if (llvm::ModelWithRegSequence())
-    setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-  else
-    setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
@@ -363,8 +375,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  
    // These are expanded into libcalls.
-  setOperationAction(ISD::SDIV,  MVT::i32, Expand);
-  setOperationAction(ISD::UDIV,  MVT::i32, Expand);
+  if (!Subtarget->hasDivide()) {
+    // v7M has a hardware divider
+    setOperationAction(ISD::SDIV,  MVT::i32, Expand);
+    setOperationAction(ISD::UDIV,  MVT::i32, Expand);
+  }
    setOperationAction(ISD::SREM,  MVT::i32, Expand);
    setOperationAction(ISD::UREM,  MVT::i32, Expand);
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
@@ -376,6 +391,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
    // Use the default implementation.
    setOperationAction(ISD::VASTART,            MVT::Other, Custom);
    setOperationAction(ISD::VAARG,              MVT::Other, Expand);
@@ -388,9 +405,58 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    // doesn't yet know how to not do that for SjLj.
    setExceptionSelectorRegister(ARM::R0);
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
-  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Custom);
-
-  if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) {
+  // Handle atomics directly for ARMv[67] (except for Thumb1), otherwise
+  // use the default expansion.
+  bool canHandleAtomics =
+    (Subtarget->hasV7Ops() ||
+      (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only()));
+  if (canHandleAtomics) {
+    // membarrier needs custom lowering; the rest are legal and handled
+    // normally.
+    setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
+  } else {
+    // Set them all for expansion, which will force libcalls.
+    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+  }
+  // 64-bit versions are always libcalls (for now)
+  setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand);
+
+  // If the subtarget does not have extract instructions, sign_extend_inreg
+  // needs to be expanded. Extract is available in ARM mode on v6 and up,
+  // and on most Thumb2 implementations.
+  if (!Subtarget->hasV6Ops()
+      || (Subtarget->isThumb2() && !Subtarget->hasT2ExtractPack())) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
    }
@@ -454,9 +520,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
    setTargetDAGCombine(ISD::ADD);
    setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::MUL);
  
    setStackPointerRegisterToSaveRestore(ARM::SP);
-  setSchedulingPreference(SchedulingForRegPressure);
+
+  if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2())
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Hybrid);
  
    // FIXME: If-converter should use instruction latency to determine
    // profitability rather than relying on fixed limits.
@@ -521,6 +592,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
    case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
  
+  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
+  
    case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
  
    case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
@@ -565,16 +638,59 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    case ARMISD::VZIP:          return "ARMISD::VZIP";
    case ARMISD::VUZP:          return "ARMISD::VUZP";
    case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
    case ARMISD::FMAX:          return "ARMISD::FMAX";
    case ARMISD::FMIN:          return "ARMISD::FMIN";
    }
  }
  
+/// getRegClassFor - Return the register class that should be used for the
+/// specified value type. With NEON, v4i64 / v8i64 map to QQ / QQQQ classes.
+TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
+  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
+  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
+  // load / store 4 to 8 consecutive D registers.
+  if (Subtarget->hasNEON()) {  // the wide-vector mapping is NEON-only
+    if (VT == MVT::v4i64)
+      return ARM::QQPRRegisterClass;
+    else if (VT == MVT::v8i64)
+      return ARM::QQQQPRRegisterClass;
+  }
+  return TargetLowering::getRegClassFor(VT);  // otherwise: base-class default
+}
+
  /// getFunctionAlignment - Return the Log2 alignment of this function.
  unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
    return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 0 : 1;
  }
  
+Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
+  unsigned NumVals = N->getNumValues();  // number of result values of N
+  if (!NumVals)
+    return Sched::RegPressure;  // no results: prefer register pressure
+
+  for (unsigned i = 0; i != NumVals; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT.isFloatingPoint() || VT.isVector())
+      return Sched::Latency;  // any FP or vector result: prefer latency
+  }
+
+  if (!N->isMachineOpcode())
+    return Sched::RegPressure;  // not a machine node: no instr desc to query
+
+  // Loads are scheduled for latency even if their instruction itinerary
+  // is not available.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
+  if (TID.mayLoad())
+    return Sched::Latency;
+
+  const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData();
+  if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2)
+    return Sched::Latency;  // itinerary reports a stage latency above 2
+  return Sched::RegPressure;  // default for everything else
+}
+
  //===----------------------------------------------------------------------===//
  // Lowering Code
  //===----------------------------------------------------------------------===//
@@ -927,8 +1043,24 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               DebugLoc dl, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &InVals) const {
-  // ARM target does not yet support tail call optimization.
-  isTailCall = false;
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  bool IsSibCall = false;
+  // Temporarily disable tail calls so things don't break.
+  if (!EnableARMTailCalls)
+    isTailCall = false;
+  if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
+                                                   Outs, Ins, DAG);
+    // We don't support GuaranteedTailCallOpt for ARM, only automatically
+    // detected sibcalls.
+    if (isTailCall) {
+      ++NumTailCalls;
+      IsSibCall = true;
+    }
+  }
  
    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
@@ -941,9 +1073,14 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    // Get a count of how many bytes are to be pushed on the stack.
    unsigned NumBytes = CCInfo.getNextStackOffset();
  
+  // For tail calls, memory operands are available in our caller's stack.
+  if (IsSibCall)
+    NumBytes = 0;
+
    // Adjust the stack pointer for the new arguments...
    // These operations are automatically eliminated by the prolog/epilog pass
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  if (!IsSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
  
    SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
  
@@ -1004,7 +1141,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
        }
      } else if (VA.isRegLoc()) {
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    } else {
+    } else if (!IsSibCall) {
        assert(VA.isMemLoc());
  
        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -1019,10 +1156,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    // Build a sequence of copy-to-reg nodes chained together with token chain
    // and flag operands which copy the outgoing args into the appropriate regs.
    SDValue InFlag;
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
+  // Tail call byval lowering might overwrite argument registers so in case of
+  // tail call optimization the copies to registers are lowered later.
+  if (!isTailCall)
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag =SDValue();
    }
  
    // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
@@ -1031,7 +1190,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    bool isDirect = false;
    bool isARMFunc = false;
    bool isLocalARMFunc = false;
-  MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  
    if (EnableARMLongCalls) {
@@ -1077,7 +1235,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                     getTargetMachine().getRelocationModel() != Reloc::Static;
      isARMFunc = !Subtarget->isThumb() || isStub;
      // ARM call to a local ARM function is predicable.
-    isLocalARMFunc = !Subtarget->isThumb() && !isExt;
+    isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
      // tBX takes a register source operand.
      if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
        unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
@@ -1149,9 +1307,13 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
  
    if (InFlag.getNode())
      Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  if (isTailCall)
+    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+
    // Returns a chain and a flag for retval copy to use.
-  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag),
-                      &Ops[0], Ops.size());
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
    InFlag = Chain.getValue(1);
  
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
@@ -1165,6 +1327,193 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                           dl, DAG, InVals);
  }
  
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack, i.e. it lives in a fixed frame object whose
+/// offset equals Offset and whose size equals the argument's size in bytes.
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const ARMInstrInfo *TII) {
+  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+  int FI = INT_MAX;  // sentinel: no frame index identified yet
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
+      return false;  // only virtual registers can be traced to a definition
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(Def, FI))  // sets FI on success
+        return false;
+    } else {  // byval: not handled; commented-out X86 logic kept for reference
+//      unsigned Opcode = Def->getOpcode();
+//      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+//          Def->getOperand(1).isFI()) {
+//        FI = Def->getOperand(1).getIndex();
+//        Bytes = Flags.getByValSize();
+//      } else
+        return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;  // load address is not a plain frame index
+    FI = FINode->getIndex();
+  } else
+    return false;  // any other kind of node is not recognized
+
+  assert(FI != INT_MAX);
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;  // must be one of the caller's fixed stack objects
+  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function. Returns false on any
+/// condition below that rules out a sibcall, true otherwise.
+ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Do not sibcall optimize vararg calls unless the call site is not passing
+  // any arguments.
+  if (isVarArg && !Outs.empty())
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // FIXME: Completely disable sibcalls for Thumb1, since
+  // Thumb1RegisterInfo::emitEpilogue is not ready for them.
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  if (isa<ExternalSymbolSDNode>(Callee))  // external-symbol callees: give up
+      return false;
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (Subtarget->isThumb1Only())  // NOTE(review): redundant, rejected above
+      return false;
+
+    // On Thumb, for the moment, we can only do this to functions defined in this
+    // compilation, or to indirect calls.  A Thumb B to an ARM function is not
+    // easily fixed up in the linker, unlike BL.
+    if (Subtarget->isThumb()) {
+      const GlobalValue *GV = G->getGlobal();
+      if (GV->isDeclaration() || GV->isWeakForLinker())
+        return false;
+    }
+  }
+
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;  // result locations under CalleeCC
+    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
+                    RVLocs1, *DAG.getContext());
+    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
+
+    SmallVector<CCValAssign, 16> RVLocs2;  // result locations under CallerCC
+    CCState CCInfo2(CallerCC, false, getTargetMachine(),
+                    RVLocs2, *DAG.getContext());
+    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+          return false;
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+          return false;
+      }
+    }
+  }
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. Stack-passed arguments are only
+    // accepted when they already sit in the caller's matching fixed slots.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
+                   ArgLocs, *DAG.getContext());
+    CCInfo.AnalyzeCallOperands(Outs,
+                               CCAssignFnForNode(CalleeCC, false, isVarArg));
+    if (CCInfo.getNextStackOffset()) {
+      MachineFunction &MF = DAG.getMachineFunction();
+
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo *MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const ARMInstrInfo *TII =
+        ((ARMTargetMachine&)getTargetMachine()).getInstrInfo();
+      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+           i != e;
+           ++i, ++realArgIdx) {  // i may skip split locs; realArgIdx is per-arg
+        CCValAssign &VA = ArgLocs[i];
+        EVT RegVT = VA.getLocVT();
+        SDValue Arg = Outs[realArgIdx].Val;
+        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (VA.needsCustom()) {
+          // f64 and vector types are split into multiple registers or
+          // register/stack-slot combinations.  The types will not match
+          // the registers; give up on memory f64 refs until we figure
+          // out what to do about this.
+          if (!VA.isRegLoc())
+            return false;
+          if (!ArgLocs[++i].isRegLoc())
+            return false; 
+          if (RegVT == MVT::v2f64) {  // v2f64 consumes two further locations
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+          }
+        } else if (!VA.isRegLoc()) {
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
+            return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
  SDValue
  ARMTargetLowering::LowerReturn(SDValue Chain,
                                 CallingConv::ID CallConv, bool isVarArg,
@@ -1509,10 +1858,24 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
    return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
  }
  
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Val = DAG.getConstant(0, MVT::i32);  // extra constant-zero operand
+  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0),
+                     Op.getOperand(1), Val);  // forwards Op's operands 0 and 1
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();  // reuse the original node's debug location
+  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
+}  // result is MVT::Other only; operands 0 and 1 forwarded plus constant 0
+
  SDValue
  ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
-                                           const ARMSubtarget *Subtarget)
-                                             const {
+                                          const ARMSubtarget *Subtarget) const {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    DebugLoc dl = Op.getDebugLoc();
    switch (IntNo) {
@@ -1548,35 +1911,25 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
      }
      return Result;
    }
-  case Intrinsic::eh_sjlj_setjmp:
-    SDValue Val = Subtarget->isThumb() ?
-      DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) :
-      DAG.getConstant(0, MVT::i32);
-    return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(1),
-                       Val);
    }
  }
  
  static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
-                          const ARMSubtarget *Subtarget) {
+                               const ARMSubtarget *Subtarget) {
    DebugLoc dl = Op.getDebugLoc();
    SDValue Op5 = Op.getOperand(5);
-  SDValue Res;
    unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue();
-  if (isDeviceBarrier) {
-    if (Subtarget->hasV7Ops())
-      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0));
-    else
-      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0),
-                        DAG.getConstant(0, MVT::i32));
-  } else {
-    if (Subtarget->hasV7Ops())
-      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
-    else
-      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
-                        DAG.getConstant(0, MVT::i32));
-  }
-  return Res;
+  // v6 and v7 can both handle barriers directly, but need to be handled a
+  // bit differently. Thumb1 and pre-v6 ARM mode use a libcall instead and
+  // should never get here.
+  unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER;
+  if (Subtarget->hasV7Ops())
+    return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0));
+  else if (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())
+    return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0),
+                       DAG.getConstant(0, MVT::i32));
+  assert(0 && "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+  return SDValue();
  }
  
  static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
@@ -1655,7 +2008,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
      RC = ARM::GPRRegisterClass;
  
    // Transform the arguments stored in physical registers into virtual ones.
-  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 
    SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  
    SDValue ArgValue2;
@@ -1833,8 +2186,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
          SDValue Store =
            DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                       PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), 0,
-                       false, false, 0);
+               PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()),
+               0, false, false, 0);
          MemOps.push_back(Store);
          FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
                            DAG.getConstant(4, getPointerTy()));
@@ -2095,9 +2448,31 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
  }
  
+SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);  // record that the return addr is used
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {  // non-zero depth: load from the frame address plus 4
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);  // Op's operand 0 is reused
+    SDValue Offset = DAG.getConstant(4, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       NULL, 0, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(ARM::LR, ARM::GPRRegisterClass); 
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
  SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setFrameAddressIsTaken(true);
+
    EVT VT = Op.getValueType();
    DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
    unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2110,116 +2485,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    return FrameAddr;
  }
  
-SDValue
-ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
-                                           SDValue Chain,
-                                           SDValue Dst, SDValue Src,
-                                           SDValue Size, unsigned Align,
-                                           bool isVolatile, bool AlwaysInline,
-                                           const Value *DstSV,
-                                           uint64_t DstSVOff,
-                                           const Value *SrcSV,
-                                           uint64_t SrcSVOff) const {
-  // Inline the copy as repeated 4-byte loads and stores. To be improved.
-  // This requires 4-byte alignment; otherwise an empty SDValue is returned.
-  if ((Align & 3) != 0)
-    return SDValue();
-  // This requires the copy size to be a constant, preferably
-  // within a subtarget-specific limit (not enforced when AlwaysInline is set).
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  if (!ConstantSize)
-    return SDValue();
-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
-    return SDValue();
-
-  unsigned BytesLeft = SizeVal & 3;  // Tail bytes left after the whole words.
-  unsigned NumMemOps = SizeVal >> 2;  // Number of whole 4-byte words to copy.
-  unsigned EmittedNumMemOps = 0;
-  EVT VT = MVT::i32;
-  unsigned VTSize = 4;
-  unsigned i = 0;
-  const unsigned MAX_LOADS_IN_LDM = 6;
-  SDValue TFOps[MAX_LOADS_IN_LDM];
-  SDValue Loads[MAX_LOADS_IN_LDM];
-  uint64_t SrcOff = 0, DstOff = 0;
-
-  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
-  // same number of stores.  The loads and stores will get combined into
-  // ldm/stm later on.
-  while (EmittedNumMemOps < NumMemOps) {
-    for (i = 0;
-         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
-      Loads[i] = DAG.getLoad(VT, dl, Chain,
-                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
-                                         DAG.getConstant(SrcOff, MVT::i32)),
-                             SrcSV, SrcSVOff + SrcOff, isVolatile, false, 0);
-      TFOps[i] = Loads[i].getValue(1);  // Collect each load's chain result.
-      SrcOff += VTSize;
-    }
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-
-    for (i = 0;
-         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
-      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
-                              DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
-                                          DAG.getConstant(DstOff, MVT::i32)),
-                              DstSV, DstSVOff + DstOff, isVolatile, false, 0);
-      DstOff += VTSize;
-    }
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-
-    EmittedNumMemOps += i;  // i is the number of words handled in this batch.
-  }
-
-  if (BytesLeft == 0)
-    return Chain;
-  // Issue loads / stores for the trailing (1 - 3) bytes.
-  // NOTE(review): these pass false where the word loop passes isVolatile.
-  unsigned BytesLeftSave = BytesLeft;
-  i = 0;
-  while (BytesLeft) {
-    if (BytesLeft >= 2) {
-      VT = MVT::i16;
-      VTSize = 2;
-    } else {
-      VT = MVT::i8;
-      VTSize = 1;
-    }
-
-    Loads[i] = DAG.getLoad(VT, dl, Chain,
-                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
-                                       DAG.getConstant(SrcOff, MVT::i32)),
-                           SrcSV, SrcSVOff + SrcOff, false, false, 0);
-    TFOps[i] = Loads[i].getValue(1);
-    ++i;
-    SrcOff += VTSize;
-    BytesLeft -= VTSize;
-  }
-  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-
-  i = 0;
-  BytesLeft = BytesLeftSave;  // Replay the same size sequence for the stores.
-  while (BytesLeft) {
-    if (BytesLeft >= 2) {
-      VT = MVT::i16;
-      VTSize = 2;
-    } else {
-      VT = MVT::i8;
-      VTSize = 1;
-    }
-
-    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
-                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
-                                        DAG.getConstant(DstOff, MVT::i32)),
-                            DstSV, DstSVOff + DstOff, false, false, 0);
-    ++i;
-    DstOff += VTSize;
-    BytesLeft -= VTSize;
-  }
-  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
-}
-
  /// ExpandBIT_CONVERT - If the target supports VFP, this function is called to
  /// expand a bit convert where either the source or destination type is i64 to
  /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -2243,7 +2508,8 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
                               DAG.getConstant(0, MVT::i32));
      SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                               DAG.getConstant(1, MVT::i32));
-    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, DstVT,
+                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
    }
  
    // Turn f64->i64 into VMOVRRD.
@@ -2437,9 +2703,9 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
  
    // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
-                             DAG.getConstant(0, MVT::i32));
+                           DAG.getConstant(0, MVT::i32));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
-                             DAG.getConstant(1, MVT::i32));
+                           DAG.getConstant(1, MVT::i32));
  
    // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
    // captures the result into a carry flag.
@@ -2553,76 +2819,152 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
    return Result;
  }
  
-/// isVMOVSplat - Check if the specified splat value corresponds to an immediate
-/// VMOV instruction, and if so, return the constant being splatted.
-static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef,
-                           unsigned SplatBitSize, SelectionDAG &DAG) {
+/// isNEONModifiedImm - Check if the specified splat value is a valid vector
+/// constant for a NEON "modified immediate" operand (e.g., VMOV).  On success
+/// return the encoded value (bit12=Op, bits11-8=Cmode, bits7-0=Immediate) if
+/// DoEncode is set, otherwise the splatted constant with the undef bits that
+/// were matched as 0xff filled in.  Returns an empty SDValue if there is no
+/// encoding; 64-bit splats are only accepted when isVMOV is true.
+static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 bool isVMOV, bool DoEncode) {
+  unsigned Op, Cmode, Imm;
+  EVT VT;
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8.  However, NEON modified
+  // immediate instructions other than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
+  Op = 0;  // Only the 64-bit case below sets Op to 1.
   switch (SplatBitSize) {
   case 8:
-    // Any 1-byte value is OK.
+    // Any 1-byte value is OK.  Op=0, Cmode=1110.
     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
-    return DAG.getTargetConstant(SplatBits, MVT::i8);
+    Cmode = 0xe;
+    Imm = SplatBits;
+    VT = MVT::i8;
+    break;
  
   case 16:
     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
-    if ((SplatBits & ~0xff) == 0 ||
-        (SplatBits & ~0xff00) == 0)
-      return DAG.getTargetConstant(SplatBits, MVT::i16);
-    break;
+    VT = MVT::i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      Cmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      Cmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();  // Neither single-byte form matched.
  
   case 32:
     // NEON's 32-bit VMOV supports splat values where:
     // * only one byte is nonzero, or
     // * the least significant byte is 0xff and the second byte is nonzero, or
     // * the least significant 2 bytes are 0xff and the third is nonzero.
-    if ((SplatBits & ~0xff) == 0 ||
-        (SplatBits & ~0xff00) == 0 ||
-        (SplatBits & ~0xff0000) == 0 ||
-        (SplatBits & ~0xff000000) == 0)
-      return DAG.getTargetConstant(SplatBits, MVT::i32);
+    VT = MVT::i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn: Op=x, Cmode=000x.
+      Cmode = 0;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00: Op=x, Cmode=001x.
+      Cmode = 0x2;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000: Op=x, Cmode=010x.
+      Cmode = 0x4;
+      Imm = SplatBits >> 16;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000: Op=x, Cmode=011x.
+      Cmode = 0x6;
+      Imm = SplatBits >> 24;
+      break;
+    }
  
     if ((SplatBits & ~0xffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xff) == 0xff)
-      return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32);
+        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+      // Value = 0x0000nnff: Op=x, Cmode=1100.
+      Cmode = 0xc;
+      Imm = SplatBits >> 8;
+      SplatBits |= 0xff;  // Commit any undef low bits to ones.
+      break;
+    }
  
     if ((SplatBits & ~0xffffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xffff) == 0xffff)
-      return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32);
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+      // Value = 0x00nnffff: Op=x, Cmode=1101.
+      Cmode = 0xd;
+      Imm = SplatBits >> 16;
+      SplatBits |= 0xffff;  // Commit any undef low bits to ones.
+      break;
+    }
  
     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
     // VMOV.I32.  A (very) minor optimization would be to replicate the value
     // and fall through here to test for a valid 64-bit splat.  But, then the
     // caller would also need to check and handle the change in size.
-    break;
+    return SDValue();
  
   case 64: {
     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+    if (!isVMOV)  // This byte-mask form is rejected for non-VMOV callers.
+      return SDValue();
     uint64_t BitMask = 0xff;
     uint64_t Val = 0;
+    unsigned ImmMask = 1;  // Bit i of Imm records that byte i is 0xff.
+    Imm = 0;
     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
-      if (((SplatBits | SplatUndef) & BitMask) == BitMask)
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
         Val |= BitMask;
-      else if ((SplatBits & BitMask) != 0)
+        Imm |= ImmMask;
+      } else if ((SplatBits & BitMask) != 0) {
         return SDValue();
+      }
       BitMask <<= 8;
+      ImmMask <<= 1;
     }
-    return DAG.getTargetConstant(Val, MVT::i64);
+    // Op=1, Cmode=1110.
+    Op = 1;
+    Cmode = 0xe;
+    SplatBits = Val;  // Splat value with every matched byte set to 0xff.
+    VT = MVT::i64;
+    break;
   }
  
   default:
-    llvm_unreachable("unexpected size for isVMOVSplat");
-    break;
+    llvm_unreachable("unexpected size for isNEONModifiedImm");
+    return SDValue();
   }
  
-  return SDValue();
+  if (DoEncode)
+    return DAG.getTargetConstant((Op << 12) | (Cmode << 8) | Imm, MVT::i32);
+  return DAG.getTargetConstant(SplatBits, VT);
 }
  
-/// getVMOVImm - If this is a build_vector of constants which can be
-/// formed by using a VMOV instruction of the specified element size,
-/// return the constant being splatted.  The ByteSize field indicates the
-/// number of bytes of each element [1248].
-SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+
+/// getNEONModImm - If this is a valid vector constant for a NEON instruction
+/// with a "modified immediate" operand (e.g., VMOV) of the specified element
+/// size, return the encoded value for that immediate.  The ByteSize field
+/// indicates the number of bytes of each element [1248].
+SDValue ARM::getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV,
+                           SelectionDAG &DAG) {
    BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
    APInt SplatBits, SplatUndef;
    unsigned SplatBitSize;
@@ -2634,8 +2976,8 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
    if (SplatBitSize > ByteSize * 8)
      return SDValue();
  
-  return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
-                     SplatBitSize, DAG);
+  return isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+                           SplatBitSize, DAG, isVMOV, true);
  }
  
  static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT,
@@ -2875,28 +3217,65 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    bool HasAnyUndefs;
    if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
      if (SplatBitSize <= 64) {
-      SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
-                                SplatUndef.getZExtValue(), SplatBitSize, DAG);
+      // Check if an immediate VMOV works.
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(),
+                                      SplatBitSize, DAG, true, false);
        if (Val.getNode())
          return BuildSplat(Val, VT, DAG, dl);
      }
    }
  
-  // If there are only 2 elements in a 128-bit vector, insert them into an
-  // undef vector.  This handles the common case for 128-bit vector argument
-  // passing, where the insertions should be translated to subreg accesses
-  // with no real instructions.
-  if (VT.is128BitVector() && Op.getNumOperands() == 2) {
-    SDValue Val = DAG.getUNDEF(VT);
-    SDValue Op0 = Op.getOperand(0);
-    SDValue Op1 = Op.getOperand(1);
-    if (Op0.getOpcode() != ISD::UNDEF)
-      Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op0,
-                        DAG.getIntPtrConstant(0));
-    if (Op1.getOpcode() != ISD::UNDEF)
-      Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op1,
-                        DAG.getIntPtrConstant(1));
-    return Val;
+  // Scan through the operands to see if only one value is used.
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool isConstant = true;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    if (!Value.getNode())
+      Value = V;
+    else if (V != Value)
+      usesOnlyOneValue = false;
+  }
+
+  if (!Value.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+  // If all elements are constants, fall back to the default expansion, which
+  // will generate a load from the constant pool.
+  if (isConstant)
+    return SDValue();
+
+  // Use VDUP for non-constant splats.
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (usesOnlyOneValue && EltSize <= 32)
+    return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+  // Vectors with 32- or 64-bit elements can be built by directly assigning
+  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
+  // will be legalized.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i)
+      Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Op.getOperand(i)));
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val);
    }
  
    return SDValue();
@@ -2932,7 +3311,9 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    bool ReverseVEXT;
    unsigned Imm, WhichResult;
  
-  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  return (EltSize >= 32 ||
+          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isVREVMask(M, VT, 64) ||
            isVREVMask(M, VT, 32) ||
            isVREVMask(M, VT, 16) ||
@@ -3030,64 +3411,67 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   // of the same type so that they get CSEd properly.
    SVN->getMask(ShuffleMask);
  
-  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
-    int Lane = SVN->getSplatIndex();
-    // If this is undef splat, generate it via "just" vdup, if possible.
-    if (Lane == -1) Lane = 0;
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize <= 32) {
+    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+      int Lane = SVN->getSplatIndex();
+      // If this is undef splat, generate it via "just" vdup, if possible.
+      if (Lane == -1) Lane = 0;
  
-    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-      return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+                         DAG.getConstant(Lane, MVT::i32));
      }
-    return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
-                       DAG.getConstant(Lane, MVT::i32));
-  }
  
-  bool ReverseVEXT;
-  unsigned Imm;
-  if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
-    if (ReverseVEXT)
-      std::swap(V1, V2);
-    return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
-                       DAG.getConstant(Imm, MVT::i32));
-  }
-
-  if (isVREVMask(ShuffleMask, VT, 64))
-    return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
-  if (isVREVMask(ShuffleMask, VT, 32))
-    return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
-  if (isVREVMask(ShuffleMask, VT, 16))
-    return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
-
-  // Check for Neon shuffles that modify both input vectors in place.
-  // If both results are used, i.e., if there are two shuffles with the same
-  // source operands and with masks corresponding to both results of one of
-  // these operations, DAG memoization will ensure that a single node is
-  // used for both shuffles.
-  unsigned WhichResult;
-  if (isVTRNMask(ShuffleMask, VT, WhichResult))
-    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
-                       V1, V2).getValue(WhichResult);
-  if (isVUZPMask(ShuffleMask, VT, WhichResult))
-    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
-                       V1, V2).getValue(WhichResult);
-  if (isVZIPMask(ShuffleMask, VT, WhichResult))
-    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
-                       V1, V2).getValue(WhichResult);
+    bool ReverseVEXT;
+    unsigned Imm;
+    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+      if (ReverseVEXT)
+        std::swap(V1, V2);
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+                         DAG.getConstant(Imm, MVT::i32));
+    }
  
-  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
-    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
-                       V1, V1).getValue(WhichResult);
-  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
-    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
-                       V1, V1).getValue(WhichResult);
-  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
-    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
-                       V1, V1).getValue(WhichResult);
+    if (isVREVMask(ShuffleMask, VT, 64))
+      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 32))
+      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 16))
+      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+    // Check for Neon shuffles that modify both input vectors in place.
+    // If both results are used, i.e., if there are two shuffles with the same
+    // source operands and with masks corresponding to both results of one of
+    // these operations, DAG memoization will ensure that a single node is
+    // used for both shuffles.
+    unsigned WhichResult;
+    if (isVTRNMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+    if (isVUZPMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+    if (isVZIPMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+
+    if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+  }
  
    // If the shuffle is not directly supported and it has 4 elements, use
    // the PerfectShuffle-generated table to synthesize it from other shuffles.
-  if (VT.getVectorNumElements() == 4 &&
-      (VT.is128BitVector() || VT.is64BitVector())) {
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 4) {
      unsigned PFIndexes[4];
      for (unsigned i = 0; i != 4; ++i) {
        if (ShuffleMask[i] < 0)
@@ -3099,7 +3483,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      // Compute the index in the perfect shuffle table.
      unsigned PFTableIndex =
        PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
-
      unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
      unsigned Cost = (PFEntry >> 30);
  
@@ -3107,6 +3490,28 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
    }
  
+  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1);
+    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      if (ShuffleMask[i] < 0)
+        Ops.push_back(DAG.getUNDEF(EltVT));
+      else
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
+                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
+                                                  MVT::i32)));
+    }
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val);
+  }
+
    return SDValue();
  }
  
@@ -3161,9 +3566,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
    case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
-  case ISD::RETURNADDR:    break;
+  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
    case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
    case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
    case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                                 Subtarget);
    case ISD::BIT_CONVERT:   return ExpandBIT_CONVERT(Op.getNode(), DAG);
@@ -3670,6 +4077,62 @@ static SDValue PerformSUBCombine(SDNode *N,
    return SDValue();
  }
  
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  // Turn i32 (mul x, (2^N +/- 1) << M) into shl plus add/sub nodes.
+  if (Subtarget->isThumb1Only())
+    return SDValue();
+
+  if (DAG.getMachineFunction().
+      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+    return SDValue();
+  // Skip before legalization and when invoked by the legalizer.
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)  // Only a constant second operand is handled.
+    return SDValue();
+  // Split the constant as MulAmt << ShiftAmt, with ShiftAmt kept in 0..31.
+  uint64_t MulAmt = C->getZExtValue();
+  unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);
+  ShiftAmt = ShiftAmt & (32 - 1);
+  SDValue V = N->getOperand(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  SDValue Res;
+  MulAmt >>= ShiftAmt;
+  if (isPowerOf2_32(MulAmt - 1)) {
+    // (mul x, 2^N + 1) => (add (shl x, N), x)
+    Res = DAG.getNode(ISD::ADD, DL, VT,
+                      V, DAG.getNode(ISD::SHL, DL, VT,
+                                     V, DAG.getConstant(Log2_32(MulAmt-1),
+                                                        MVT::i32)));
+  } else if (isPowerOf2_32(MulAmt + 1)) {
+    // (mul x, 2^N - 1) => (sub (shl x, N), x)
+    Res = DAG.getNode(ISD::SUB, DL, VT,
+                      DAG.getNode(ISD::SHL, DL, VT,
+                                  V, DAG.getConstant(Log2_32(MulAmt+1),
+                                                     MVT::i32)),
+                                                     V);
+  } else
+    return SDValue();
+  // Reapply the power-of-two factor that was stripped from the constant.
+  if (ShiftAmt != 0)
+    Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+                      DAG.getConstant(ShiftAmt, MVT::i32));
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, Res, false);
+  return SDValue();  // N was already replaced through CombineTo above.
+}
+
  /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
  /// ARMISD::VMOVRRD.
  static SDValue PerformVMOVRRDCombine(SDNode *N,
@@ -3811,7 +4274,8 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
        // Narrowing shifts require an immediate right shift.
        if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
          break;
-      llvm_unreachable("invalid shift count for narrowing vector shift intrinsic");
+      llvm_unreachable("invalid shift count for narrowing vector shift "
+                       "intrinsic");
  
      default:
        llvm_unreachable("unhandled vector shift");
@@ -4056,6 +4520,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
    default: break;
    case ISD::ADD:        return PerformADDCombine(N, DCI);
    case ISD::SUB:        return PerformSUBCombine(N, DCI);
+  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
    case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
    case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
    case ISD::SHL:
@@ -4435,9 +4900,11 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
    bool isSEXTLoad = false;
    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
      VT  = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
      isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
    } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
      VT  = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
    } else
      return false;
  
@@ -4445,13 +4912,25 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
    bool isLegal = false;
    if (Subtarget->isThumb2())
      isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
-                                        isInc, DAG);
+                                       isInc, DAG);
    else
      isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                          isInc, DAG);
    if (!isLegal)
      return false;
  
+  if (Ptr != Base) {
+    // Swap base ptr and offset to catch more post-index load / store when
+    // it's legal. In Thumb2 mode, offset must be an immediate.
+    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
+        !Subtarget->isThumb2())
+      std::swap(Base, Offset);
+
+    // Post-indexed load / store update the base pointer.
+    if (Ptr != Base)
+      return false;
+  }
+
    AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
    return true;
  }
@@ -4522,7 +5001,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
      }
    }
    if (StringRef("{cc}").equals_lower(Constraint))
-    return std::make_pair(0U, ARM::CCRRegisterClass);
+    return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass);
  
    return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
  }