X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMISelLowering.cpp;h=7df13834055704e8c397a094de12b2e2f0f83410;hb=dc73dc09f135cb1d36fd18f1918395ddf68cec63;hp=d63557a34ab2d52c22fc93042fc5578f0c663b27;hpb=8e673570b9e6f7d3f4c30d0436062d6dff8aef25;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index d63557a34ab..e01d26a67d8 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -23,6 +23,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -40,6 +41,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCSectionMachO.h" @@ -47,6 +49,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include using namespace llvm; @@ -57,11 +60,6 @@ STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); -cl::opt -EnableARMLongCalls("arm-long-calls", cl::Hidden, - cl::desc("Generate calls via indirect call instructions"), - cl::init(false)); - static cl::opt ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), @@ -144,6 +142,11 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + + if (VT.isInteger()) { + setOperationAction(ISD::SABSDIFF, VT, Legal); + setOperationAction(ISD::UABSDIFF, VT, Legal); + } } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { @@ -156,26 +159,18 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } -static TargetLoweringObjectFile *createTLOF(const Triple &TT) { - if (TT.isOSBinFormatMachO()) - return new TargetLoweringObjectFileMachO(); - if (TT.isOSWindows()) - return new TargetLoweringObjectFileCOFF(); - return new ARMElfTargetObjectFile(); -} - -ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) - : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { - Subtarget = &TM.getSubtarget(); - RegInfo = TM.getSubtargetImpl()->getRegisterInfo(); - Itins = TM.getSubtargetImpl()->getInstrItineraryData(); +ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, + const ARMSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + RegInfo = Subtarget->getRegisterInfo(); + Itins = Subtarget->getInstrItineraryData(); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (Subtarget->isTargetMachO()) { // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && - Subtarget->hasARMOps() && !TM.Options.UseSoftFloat) { + Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { // Single-precision floating-point arithmetic. setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); @@ -406,33 +401,34 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else addRegisterClass(MVT::i32, &ARM::GPRRegClass); - if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); } - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); + setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); + setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); + if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); @@ -552,6 +548,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + // NEON does not have single instruction CTTZ for vectors. + setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); @@ -575,15 +592,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. - MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8, - MVT::v4i16, MVT::v2i16, - MVT::v2i32}; - for (unsigned i = 0; i < 6; ++i) { - setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal); - setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal); - setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal); + for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, + MVT::v2i32}) { + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); + } } } @@ -621,15 +639,23 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setOperationAction(ISD::FRINT, MVT::f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); setOperationAction(ISD::FFLOOR, MVT::f64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); } - computeRegisterProperties(); + computeRegisterProperties(Subtarget->getRegisterInfo()); // ARM does not have floating-point extending loads. - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + } // ... or truncating stores setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -637,7 +663,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setTruncStoreAction(MVT::f64, MVT::f16, Expand); // ARM does not have i1 sign extending load. - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // ARM supports all 4 flavors of integer indexed load / store. if (!Subtarget->isThumb1Only()) { @@ -817,7 +844,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. @@ -827,11 +854,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget->isTargetDarwin()) { - setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); - setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); + if (Subtarget->isTargetDarwin()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); - } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -858,7 +885,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); - if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); @@ -872,15 +899,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) } // Various VFP goodness - if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) { - // int <-> fp are custom expanded into bit_convert + ARMISD ops. - if (Subtarget->hasVFP2()) { - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - } - + if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); @@ -937,7 +956,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setStackPointerRegisterToSaveRestore(ARM::SP); - if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() || + if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) setSchedulingPreference(Sched::RegPressure); else @@ -961,6 +980,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } +bool ARMTargetLowering::useSoftFloat() const { + return Subtarget->useSoftFloat(); +} + // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, @@ -971,13 +994,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM) // of the difficulty prior to coalescing of modeling operand register classes // due to the common occurrence of cross class copies and subregister insertions // and extractions. -std::pair -ARMTargetLowering::findRepresentativeClass(MVT VT) const{ +std::pair +ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: - return TargetLowering::findRepresentativeClass(VT); + return TargetLowering::findRepresentativeClass(TRI, VT); // Use DPR as representative register class for all floating point // and vector types. Since there are 32 SPR registers and 32 DPR registers so // the cost is 1 for both f32 and f64. @@ -1009,11 +1033,12 @@ ARMTargetLowering::findRepresentativeClass(MVT VT) const{ } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: return nullptr; + switch ((ARMISD::NodeType)Opcode) { + case ARMISD::FIRST_NUMBER: break; case ARMISD::Wrapper: return "ARMISD::Wrapper"; case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; + case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; @@ -1036,11 +1061,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::RBIT: return "ARMISD::RBIT"; - case ARMISD::FTOSI: return "ARMISD::FTOSI"; - case ARMISD::FTOUI: return "ARMISD::FTOUI"; - case ARMISD::SITOF: return "ARMISD::SITOF"; - case ARMISD::UITOF: return "ARMISD::UITOF"; - case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1054,7 +1074,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; - case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; @@ -1095,6 +1116,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; + case ARMISD::VSLI: return "ARMISD::VSLI"; + case ARMISD::VSRI: return "ARMISD::VSRI"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; @@ -1145,10 +1168,13 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; } + return nullptr; } -EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) return getPointerTy(); +EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { + if (!VT.isVector()) + return getPointerTy(DL); return VT.changeVectorElementTypeToInteger(); } @@ -1167,6 +1193,20 @@ const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { return TargetLowering::getRegClassFor(VT); } +// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the +// source/dest is aligned and the copy size is large enough. We therefore want +// to align such objects passed to memory intrinsics. +bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, + unsigned &PrefAlign) const { + if (!isa(CI)) + return false; + MinSize = 8; + // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 + // cycle faster than 4-byte aligned LDM. + PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); + return true; +} + // Create a fast isel object. FastISel * ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, @@ -1174,12 +1214,6 @@ ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, return ARM::createFastISel(funcInfo, libInfo); } -/// getMaximalGlobalOffset - Returns the maximal possible offset which can -/// be used for loads / stores from the global. -unsigned ARMTargetLowering::getMaximalGlobalOffset() const { - return (Subtarget->isThumb1Only() ? 127 : 4095); -} - Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { unsigned NumVals = N->getNumValues(); if (!NumVals) @@ -1198,8 +1232,7 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { // Load are scheduled for latency even if there instruction itinerary // is not available. - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); if (MCID.getNumDefs() == 0) @@ -1374,7 +1407,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, if (VA.getLocVT() == MVT::v2f64) { SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); VA = RVLocs[++i]; // skip ahead to next loc Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); @@ -1388,7 +1421,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, dl, MVT::i32)); } } else { Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), @@ -1419,8 +1452,9 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); - PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); return DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(LocMemOffset), false, false, 0); @@ -1444,7 +1478,8 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, else { assert(NextVA.isMemLoc()); if (!StackPtr.getNode()) - StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, + getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, @@ -1459,7 +1494,7 @@ SDValue ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; - SDLoc &dl = CLI.DL; + SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; @@ -1474,9 +1509,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; bool isSibCall = false; + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); // Disable tail calls if they're not supported. - if (!Subtarget->supportsTailCall() || MF.getTarget().Options.DisableTailCalls) + if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; if (isTailCall) { @@ -1513,10 +1549,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, + DAG.getIntPtrConstant(NumBytes, dl, true), dl); - SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); + SDValue StackPtr = + DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); RegsToPassVector RegsToPass; SmallVector MemOpChains; @@ -1553,9 +1590,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, dl, MVT::i32)); PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); @@ -1590,17 +1627,18 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // True if this byval aggregate will be split between registers // and memory. unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); - unsigned CurByValIdx = CCInfo.getInRegsParamsProceed(); + unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); if (CurByValIdx < ByValArgsCount) { unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { - SDValue Const = DAG.getConstant(4*i, MVT::i32); + SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), @@ -1618,15 +1656,16 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (Flags.getByValSize() > 4*offset) { + auto PtrVT = getPointerTy(DAG.getDataLayout()); unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset); - SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, - StkPtrOff); - SDValue SrcOffset = DAG.getIntPtrConstant(4*offset); - SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset); - SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, + SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); + SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); + SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); - SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32); + SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, + MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; @@ -1682,8 +1721,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isARMFunc = false; bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo(); + auto PtrVt = getPointerTy(DAG.getDataLayout()); - if (EnableARMLongCalls) { + if (Subtarget->genLongCalls()) { assert((Subtarget->isTargetWindows() || getTargetMachine().getRelocationModel() == Reloc::Static) && "long-calls with non-static relocation model!"); @@ -1698,12 +1738,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { const char *Sym = S->getSymbol(); @@ -1713,29 +1752,28 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); isDirect = true; - bool isExt = GV->isDeclaration() || GV->isWeakForLinker(); - bool isStub = (isExt && Subtarget->isTargetMachO()) && + bool isDef = GV->isStrongDefinitionForLinker(); + bool isStub = (!isDef && Subtarget->isTargetMachO()) && getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); + isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); // tBX takes a register source operand. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); - Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(), - DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), - 0, ARMII::MO_NONLAZY)); - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, + Callee = DAG.getNode( + ARMISD::WrapperPIC, dl, PtrVt, + DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(), false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && @@ -1743,20 +1781,20 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned TargetFlags = GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), /*Offset=*/0, - TargetFlags); + Callee = + DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) - Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ARMISD::Wrapper, dl, getPointerTy(), - Callee), MachinePointerInfo::getGOT(), - false, false, false, 0); + Callee = + DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), + DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), + MachinePointerInfo::getGOT(), false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, OpFlags); } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; @@ -1770,29 +1808,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(getPointerTy(), dl, - DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); - Callee = DAG.getNode(ARMISD::PIC_ADD, dl, - getPointerTy(), Callee, PICLabel); + Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), false, false, + false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { unsigned OpFlags = 0; // On ELF targets for PIC code, direct calls should go through the PLT if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) OpFlags = ARMII::MO_PLT; - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, OpFlags); } } // FIXME: handle tail calls differently. unsigned CallOpc; - bool HasMinSizeAttr = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); + bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -1823,21 +1858,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. if (!isTailCall) { const uint32_t *Mask; - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const ARMBaseRegisterInfo *ARI = static_cast(TRI); + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); if (isThisReturn) { // For 'this' returns, use the R0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { // Set isThisReturn to false if the calling convention is not one that // allows 'returned' to be modeled in this way, so LowerCallResult does // not try to pass 'this' straight through isThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); + Mask = ARI->getCallPreservedMask(MF, CallConv); } } else - Mask = ARI->getCallPreservedMask(CallConv); + Mask = ARI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -1847,15 +1880,17 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - if (isTailCall) + if (isTailCall) { + MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); + } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, dl); + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); @@ -1870,58 +1905,58 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// on the stack. Remember the next parameter register to allocate, /// and then confiscate the rest of the parameter registers to insure /// this. -void -ARMTargetLowering::HandleByVal( - CCState *State, unsigned &size, unsigned Align) const { - unsigned reg = State->AllocateReg(GPRArgRegs, 4); +void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, + unsigned Align) const { assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && "unhandled ParmContext"); - if ((ARM::R0 <= reg) && (reg <= ARM::R3)) { - if (Subtarget->isAAPCS_ABI() && Align > 4) { - unsigned AlignInRegs = Align / 4; - unsigned Waste = (ARM::R4 - reg) % AlignInRegs; - for (unsigned i = 0; i < Waste; ++i) - reg = State->AllocateReg(GPRArgRegs, 4); - } - if (reg != 0) { - unsigned excess = 4 * (ARM::R4 - reg); - - // Special case when NSAA != SP and parameter size greater than size of - // all remained GPR regs. In that case we can't split parameter, we must - // send it to stack. We also must set NCRN to R4, so waste all - // remained registers. - const unsigned NSAAOffset = State->getNextStackOffset(); - if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) { - while (State->AllocateReg(GPRArgRegs, 4)) - ; - return; - } + // Byval (as with any stack) slots are always at least 4 byte aligned. + Align = std::max(Align, 4U); - // First register for byval parameter is the first register that wasn't - // allocated before this method call, so it would be "reg". - // If parameter is small enough to be saved in range [reg, r4), then - // the end (first after last) register would be reg + param-size-in-regs, - // else parameter would be splitted between registers and stack, - // end register would be r4 in this case. - unsigned ByValRegBegin = reg; - unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4; - State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); - // Note, first register is allocated in the beginning of function already, - // allocate remained amount of registers we need. - for (unsigned i = reg+1; i != ByValRegEnd; ++i) - State->AllocateReg(GPRArgRegs, 4); - // A byval parameter that is split between registers and memory needs its - // size truncated here. - // In the case where the entire structure fits in registers, we set the - // size in memory to zero. - if (size < excess) - size = 0; - else - size -= excess; - } + unsigned Reg = State->AllocateReg(GPRArgRegs); + if (!Reg) + return; + + unsigned AlignInRegs = Align / 4; + unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; + for (unsigned i = 0; i < Waste; ++i) + Reg = State->AllocateReg(GPRArgRegs); + + if (!Reg) + return; + + unsigned Excess = 4 * (ARM::R4 - Reg); + + // Special case when NSAA != SP and parameter size greater than size of + // all remained GPR regs. In that case we can't split parameter, we must + // send it to stack. We also must set NCRN to R4, so waste all + // remained registers. + const unsigned NSAAOffset = State->getNextStackOffset(); + if (NSAAOffset != 0 && Size > Excess) { + while (State->AllocateReg(GPRArgRegs)) + ; + return; } + + // First register for byval parameter is the first register that wasn't + // allocated before this method call, so it would be "reg". + // If parameter is small enough to be saved in range [reg, r4), then + // the end (first after last) register would be reg + param-size-in-regs, + // else parameter would be splitted between registers and stack, + // end register would be r4 in this case. + unsigned ByValRegBegin = Reg; + unsigned ByValRegEnd = std::min(Reg + Size / 4, ARM::R4); + State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); + // Note, first register is allocated in the beginning of function already, + // allocate remained amount of registers we need. + for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) + State->AllocateReg(GPRArgRegs); + // A byval parameter that is split between registers and memory needs its + // size truncated here. + // In the case where the entire structure fits in registers, we set the + // size in memory to zero. + Size = std::max(Size - Excess, 0); } /// MatchingStackOffset - Return true if the given stack call argument is @@ -2004,7 +2039,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo:: + // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation // support in the assembler and linker to be used. This would need to be @@ -2033,7 +2068,9 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); - if (GV->hasExternalWeakLinkage()) + const Triple &TT = getTargetMachine().getTargetTriple(); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } @@ -2092,8 +2129,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { @@ -2168,7 +2204,8 @@ static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, report_fatal_error("Unsupported interrupt attribute. If present, value " "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); - RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false)); + RetOps.insert(RetOps.begin() + 1, + DAG.getConstant(LROffset, DL, MVT::i32, false)); return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); } @@ -2221,7 +2258,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); @@ -2240,7 +2277,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, // Extract the 2nd half and fall through to handle it as an f64 value. Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, dl, MVT::i32)); } // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is // available. @@ -2364,12 +2401,32 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; - if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; return !Subtarget->isThumb1Only(); } +// Trying to write a 64 bit value so need to split into two 32 bit values first, +// and pass the lower and high parts through. +static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue WriteValue = Op->getOperand(2); + + // This function is only supposed to be called for i64 type argument. + assert(WriteValue.getValueType() == MVT::i64 + && "LowerWRITE_REGISTER called for non-i64 type argument."); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, + DAG.getConstant(0, DL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, + DAG.getConstant(1, DL, MVT::i32)); + SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; + return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); +} + // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise @@ -2401,7 +2458,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; SDLoc DL(Op); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast(Op)->getBlockAddress(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; @@ -2421,7 +2478,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, false, false, false, 0); if (RelocM == Reloc::Static) return Result; - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } @@ -2430,7 +2487,7 @@ SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { SDLoc dl(GA); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -2445,7 +2502,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, false, false, false, 0); SDValue Chain = Argument.getValue(1); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); // call __tls_get_addr. @@ -2476,7 +2533,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -2497,7 +2554,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, false, false, false, 0); Chain = Offset.getValue(1); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, @@ -2526,6 +2583,8 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); GlobalAddressSDNode *GA = cast(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -2542,7 +2601,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { @@ -2585,7 +2644,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); @@ -2616,7 +2675,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, const GlobalValue *GV = cast(Op)->getGlobal(); const ARMII::TOF TargetFlags = (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); @@ -2640,7 +2699,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = @@ -2651,14 +2710,14 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(), false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - SDValue Val = DAG.getConstant(0, MVT::i32); + SDValue Val = DAG.getConstant(0, dl, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1), Val); @@ -2668,7 +2727,14 @@ SDValue ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), - Op.getOperand(1), DAG.getConstant(0, MVT::i32)); + Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); +} + +SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, + Op.getOperand(0)); } SDValue @@ -2684,14 +2750,14 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); SDValue CPAddr; unsigned PCAdj = (RelocM != Reloc::PIC_) @@ -2707,7 +2773,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, false, false, false, 0); if (RelocM == Reloc::PIC_) { - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } return Result; @@ -2733,7 +2799,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); } ConstantSDNode *OrdN = cast(Op.getOperand(1)); @@ -2750,8 +2816,8 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, } return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(Intrinsic::arm_dmb, MVT::i32), - DAG.getConstant(Domain, MVT::i32)); + DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), + DAG.getConstant(Domain, dl, MVT::i32)); } static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, @@ -2777,8 +2843,8 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, } return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), - Op.getOperand(1), DAG.getConstant(isRead, MVT::i32), - DAG.getConstant(isData, MVT::i32)); + Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), + DAG.getConstant(isData, dl, MVT::i32)); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { @@ -2788,7 +2854,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc dl(Op); - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), @@ -2818,7 +2884,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -2831,55 +2897,6 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } -void -ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, - unsigned InRegsParamRecordIdx, - unsigned ArgSize, - unsigned &ArgRegsSize, - unsigned &ArgRegsSaveSize) - const { - unsigned NumGPRs; - if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { - unsigned RBegin, REnd; - CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); - NumGPRs = REnd - RBegin; - } else { - unsigned int firstUnalloced; - firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, - sizeof(GPRArgRegs) / - sizeof(GPRArgRegs[0])); - NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0; - } - - unsigned Align = MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment(); - ArgRegsSize = NumGPRs * 4; - - // If parameter is split between stack and GPRs... - if (NumGPRs && Align > 4 && - (ArgRegsSize < ArgSize || - InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) { - // Add padding for part of param recovered from GPRs. For example, - // if Align == 8, its last byte must be at address K*8 - 1. - // We need to do it, since remained (stack) part of parameter has - // stack alignment, and we need to "attach" "GPRs head" without gaps - // to it: - // Stack: - // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes... - // [ [padding] [GPRs head] ] [ Tail passed via stack .... - // - ARMFunctionInfo *AFI = MF.getInfo(); - unsigned Padding = - OffsetToAlignment(ArgRegsSize + AFI->getArgRegsSaveSize(), Align); - ArgRegsSaveSize = ArgRegsSize + Padding; - } else - // We don't need to extend regs save size for byval parameters if they - // are passed via GPRs only. - ArgRegsSaveSize = ArgRegsSize; -} - // The remaining GPRs hold either the beginning of variable-argument // data, or the beginning of an aggregate passed by value (usually // byval). Either way, we allocate stack slots adjacent to the data @@ -2893,13 +2910,8 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, SDLoc dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, - unsigned OffsetFromOrigArg, - unsigned ArgOffset, - unsigned ArgSize, - bool ForceMutable, - unsigned ByValStoreOffset, - unsigned TotalArgRegsSaveSize) const { - + int ArgOffset, + unsigned ArgSize) const { // Currently, two use-cases possible: // Case #1. Non-var-args function, and we meet first byval parameter. // Setup first unallocated register as first byval register; @@ -2914,83 +2926,39 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); - unsigned firstRegToSaveIndex, lastRegToSaveIndex; unsigned RBegin, REnd; if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); - firstRegToSaveIndex = RBegin - ARM::R0; - lastRegToSaveIndex = REnd - ARM::R0; } else { - firstRegToSaveIndex = CCInfo.getFirstUnallocated - (GPRArgRegs, array_lengthof(GPRArgRegs)); - lastRegToSaveIndex = 4; - } - - unsigned ArgRegsSize, ArgRegsSaveSize; - computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize, - ArgRegsSize, ArgRegsSaveSize); - - // Store any by-val regs to their spots on the stack so that they may be - // loaded by deferencing the result of formal parameter pointer or va_next. - // Note: once stack area for byval/varargs registers - // was initialized, it can't be initialized again. - if (ArgRegsSaveSize) { - unsigned Padding = ArgRegsSaveSize - ArgRegsSize; - - if (Padding) { - assert(AFI->getStoredByValParamsPadding() == 0 && - "The only parameter may be padded."); - AFI->setStoredByValParamsPadding(Padding); - } - - int FrameIndex = MFI->CreateFixedObject(ArgRegsSaveSize, - Padding + - ByValStoreOffset - - (int64_t)TotalArgRegsSaveSize, - false); - SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy()); - if (Padding) { - MFI->CreateFixedObject(Padding, - ArgOffset + ByValStoreOffset - - (int64_t)ArgRegsSaveSize, - false); - } - - SmallVector MemOps; - for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex; - ++firstRegToSaveIndex, ++i) { - const TargetRegisterClass *RC; - if (AFI->isThumb1OnlyFunction()) - RC = &ARM::tGPRRegClass; - else - RC = &ARM::GPRRegClass; + unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); + RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; + REnd = ARM::R4; + } - unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, - DAG.getConstant(4, getPointerTy())); - } + if (REnd != RBegin) + ArgOffset = -4 * (ARM::R4 - RBegin); - AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + int FrameIndex = MFI->CreateFixedObject(ArgSize, ArgOffset, false); + SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - return FrameIndex; - } else { - if (ArgSize == 0) { - // We cannot allocate a zero-byte object for the first variadic argument, - // so just make up a size. - ArgSize = 4; - } - // This will point to the next argument passed via stack. - return MFI->CreateFixedObject( - ArgSize, ArgOffset, !ForceMutable); + SmallVector MemOps; + const TargetRegisterClass *RC = + AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; + + for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { + unsigned VReg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(OrigArg, 4 * i), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + return FrameIndex; } // Setup stack frame, the va_list pointer will start from. @@ -3008,11 +2976,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, // the result of va_next. // If there is no regs to be stored, just point address after last // argument passed via stack. - int FrameIndex = - StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, - CCInfo.getInRegsParamsCount(), 0, ArgOffset, 0, ForceMutable, - 0, TotalArgRegsSaveSize); - + int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, + CCInfo.getInRegsParamsCount(), + CCInfo.getNextStackOffset(), 4); AFI->setVarArgsFrameIndex(FrameIndex); } @@ -3038,7 +3004,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, isVarArg)); SmallVector ArgValues; - int lastInsIndex = -1; SDValue ArgValue; Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; @@ -3048,55 +3013,49 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // We also increase this value in case of varargs function. AFI->setArgRegsSaveSize(0); - unsigned ByValStoreOffset = 0; - unsigned TotalArgRegsSaveSize = 0; - unsigned ArgRegsSaveSizeMaxAlign = 4; - // Calculate the amount of stack space that we need to allocate to store // byval and variadic arguments that are passed in registers. // We need to know this before we allocate the first byval or variadic // argument, as they will be allocated a stack slot below the CFA (Canonical // Frame Address, the stack pointer at entry to the function). + unsigned ArgRegBegin = ARM::R4; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) + break; + CCValAssign &VA = ArgLocs[i]; - if (VA.isMemLoc()) { - int index = VA.getValNo(); - if (index != lastInsIndex) { - ISD::ArgFlagsTy Flags = Ins[index].Flags; - if (Flags.isByVal()) { - unsigned ExtraArgRegsSize; - unsigned ExtraArgRegsSaveSize; - computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProceed(), - Flags.getByValSize(), - ExtraArgRegsSize, ExtraArgRegsSaveSize); - - TotalArgRegsSaveSize += ExtraArgRegsSaveSize; - if (Flags.getByValAlign() > ArgRegsSaveSizeMaxAlign) - ArgRegsSaveSizeMaxAlign = Flags.getByValAlign(); - CCInfo.nextInRegsParam(); - } - lastInsIndex = index; - } - } + unsigned Index = VA.getValNo(); + ISD::ArgFlagsTy Flags = Ins[Index].Flags; + if (!Flags.isByVal()) + continue; + + assert(VA.isMemLoc() && "unexpected byval pointer in reg"); + unsigned RBegin, REnd; + CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); + ArgRegBegin = std::min(ArgRegBegin, RBegin); + + CCInfo.nextInRegsParam(); } CCInfo.rewindByValRegsInfo(); - lastInsIndex = -1; + + int lastInsIndex = -1; if (isVarArg && MFI->hasVAStart()) { - unsigned ExtraArgRegsSize; - unsigned ExtraArgRegsSaveSize; - computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsCount(), 0, - ExtraArgRegsSize, ExtraArgRegsSaveSize); - TotalArgRegsSaveSize += ExtraArgRegsSaveSize; + unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); + if (RegIdx != array_lengthof(GPRArgRegs)) + ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); } - // If the arg regs save area contains N-byte aligned values, the - // bottom of it must be at least N-byte aligned. - TotalArgRegsSaveSize = RoundUpToAlignment(TotalArgRegsSaveSize, ArgRegsSaveSizeMaxAlign); - TotalArgRegsSaveSize = std::min(TotalArgRegsSaveSize, 16U); + + unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); + AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); + auto PtrVT = getPointerTy(DAG.getDataLayout()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx); - CurArgIdx = Ins[VA.getValNo()].OrigArgIndex; + if (Ins[VA.getValNo()].isOrigArg()) { + std::advance(CurOrigArg, + Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); + } // Arguments stored in registers. if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); @@ -3111,7 +3070,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SDValue ArgValue2; if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0); @@ -3121,9 +3080,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, } ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue1, DAG.getIntPtrConstant(0)); + ArgValue, ArgValue1, + DAG.getIntPtrConstant(0, dl)); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue2, DAG.getIntPtrConstant(1)); + ArgValue, ArgValue2, + DAG.getIntPtrConstant(1, dl)); } else ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); @@ -3137,9 +3098,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, else if (RegVT == MVT::v2f64) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) - RC = AFI->isThumb1OnlyFunction() ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass; + RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -3177,7 +3137,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); - int index = ArgLocs[i].getValNo(); + int index = VA.getValNo(); // Some Ins[] entries become multiple ArgLoc[] entries. // Process them only once. @@ -3190,21 +3150,14 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { - unsigned CurByValIndex = CCInfo.getInRegsParamsProceed(); - - ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign()); - int FrameIndex = StoreByValRegs( - CCInfo, DAG, dl, Chain, CurOrigArg, - CurByValIndex, - Ins[VA.getValNo()].PartOffset, - VA.getLocMemOffset(), - Flags.getByValSize(), - true /*force mutable frames*/, - ByValStoreOffset, - TotalArgRegsSaveSize); - ByValStoreOffset += Flags.getByValSize(); - ByValStoreOffset = std::min(ByValStoreOffset, 16U); - InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy())); + assert(Ins[index].isOrigArg() && + "Byval arguments cannot be implicit"); + unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); + + int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, + CurByValIndex, VA.getLocMemOffset(), + Flags.getByValSize()); + InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { unsigned FIOffset = VA.getLocMemOffset(); @@ -3212,7 +3165,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, FIOffset, true); // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo::getFixedStack(FI), false, false, false, 0)); @@ -3245,6 +3198,18 @@ static bool isFloatingPointZero(SDValue Op) { if (const ConstantFP *CFP = dyn_cast(CP->getConstVal())) return CFP->getValueAPF().isPosZero(); } + } else if (Op->getOpcode() == ISD::BITCAST && + Op->getValueType(0) == MVT::f64) { + // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) + // created by LowerConstantFP(). + SDValue BitcastOp = Op->getOperand(0); + if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) { + SDValue MoveOp = BitcastOp->getOperand(0); + if (MoveOp->getOpcode() == ISD::TargetConstant && + cast(MoveOp)->getZExtValue() == 0) { + return true; + } + } } return false; } @@ -3265,28 +3230,28 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, case ISD::SETGE: if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; - RHS = DAG.getConstant(C-1, MVT::i32); + RHS = DAG.getConstant(C - 1, dl, MVT::i32); } break; case ISD::SETULT: case ISD::SETUGE: if (C != 0 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - RHS = DAG.getConstant(C-1, MVT::i32); + RHS = DAG.getConstant(C - 1, dl, MVT::i32); } break; case ISD::SETLE: case ISD::SETGT: if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; - RHS = DAG.getConstant(C+1, MVT::i32); + RHS = DAG.getConstant(C + 1, dl, MVT::i32); } break; case ISD::SETULE: case ISD::SETUGT: if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; - RHS = DAG.getConstant(C+1, MVT::i32); + RHS = DAG.getConstant(C + 1, dl, MVT::i32); } break; } @@ -3305,7 +3270,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, CompareType = ARMISD::CMPZ; break; } - ARMcc = DAG.getConstant(CondCode, MVT::i32); + ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); } @@ -3351,7 +3316,7 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue Value, OverflowCmp; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - + SDLoc dl(Op); // FIXME: We are currently always generating CMPs because we don't support // generating CMN through the backend. This is not as good as the natural @@ -3362,24 +3327,24 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: - ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32); - Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS); + ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); + Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::UADDO: - ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32); - Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS); + ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); + Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::SSUBO: - ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32); - Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS); + ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); + Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; case ISD::USUBO: - ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32); - Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS); - OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS); + ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); + Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; } // switch (...) @@ -3397,16 +3362,17 @@ ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + SDLoc dl(Op); // We use 0 and 1 as false and true values. - SDValue TVal = DAG.getConstant(1, MVT::i32); - SDValue FVal = DAG.getConstant(0, MVT::i32); + SDValue TVal = DAG.getConstant(1, dl, MVT::i32); + SDValue FVal = DAG.getConstant(0, dl, MVT::i32); EVT VT = Op.getValueType(); - SDValue Overflow = DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, TVal, FVal, + SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, CCR, OverflowCmp); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); + return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } @@ -3429,7 +3395,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); EVT VT = Op.getValueType(); - return getCMOV(SDLoc(Op), VT, SelectTrue, SelectFalse, ARMcc, CCR, + return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, OverflowCmp, DAG); } @@ -3472,19 +3438,13 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the // undefined bits before doing a full-word comparison with zero. Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, - DAG.getConstant(1, Cond.getValueType())); + DAG.getConstant(1, dl, Cond.getValueType())); return DAG.getSelectCC(dl, Cond, - DAG.getConstant(0, Cond.getValueType()), + DAG.getConstant(0, dl, Cond.getValueType()), SelectTrue, SelectFalse, ISD::SETNE); } -static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) { - if (CC == ISD::SETNE) - return ISD::SETEQ; - return ISD::getSetCCInverse(CC, true); -} - static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps) { // Start by selecting the GE condition code for opcodes that return true for @@ -3576,7 +3536,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // If softenSetCCOperands only returned one value, we should compare it to // zero. if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); + RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } @@ -3592,12 +3552,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // inverting the compare condition, swapping 'less' and 'greater') and // sometimes need to swap the operands to the VSEL (which inverts the // condition in the sense of firing whenever the previous condition didn't) - if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { + if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { ARMCC::CondCodes CondCode = IntCCToARMCC(CC); if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || CondCode == ARMCC::VC || CondCode == ARMCC::NE) { - CC = getInverseCCForVSEL(CC); + CC = ISD::getSetCCInverse(CC, true); std::swap(TrueVal, FalseVal); } } @@ -3611,21 +3571,114 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); - // Try to generate VSEL on ARMv8. - if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { - // We can select VMAXNM/VMINNM from a compare followed by a select with the + // Try to generate VMAXNM/VMINNM on ARMv8. + if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { + // We can use VMAXNM/VMINNM for a compare followed by a select with the // same operands, as follows: - // c = fcmp [ogt, olt, ugt, ult] a, b + // c = fcmp [?gt, ?ge, ?lt, ?le] a, b // select c, a, b - // We only do this in unsafe-fp-math, because signed zeros and NaNs are - // handled differently than the original code sequence. - if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal && - RHS == FalseVal) { - if (CC == ISD::SETOGT || CC == ISD::SETUGT) - return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal); - if (CC == ISD::SETOLT || CC == ISD::SETULT) - return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal); + // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'. + bool swapSides = false; + if (!getTargetMachine().Options.NoNaNsFPMath) { + // transformability may depend on which way around we compare + switch (CC) { + default: + break; + case ISD::SETOGT: + case ISD::SETOGE: + case ISD::SETOLT: + case ISD::SETOLE: + // the non-NaN should be RHS + swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS); + break; + case ISD::SETUGT: + case ISD::SETUGE: + case ISD::SETULT: + case ISD::SETULE: + // the non-NaN should be LHS + swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS); + break; + } + } + swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal); + if (swapSides) { + CC = ISD::getSetCCSwappedOperands(CC); + std::swap(LHS, RHS); + } + if (LHS == TrueVal && RHS == FalseVal) { + bool canTransform = true; + // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here + if (!getTargetMachine().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { + const ConstantFPSDNode *Zero; + switch (CC) { + default: + break; + case ISD::SETOGT: + case ISD::SETUGT: + case ISD::SETGT: + // RHS must not be -0 + canTransform = (Zero = dyn_cast(RHS)) && + !Zero->isNegative(); + break; + case ISD::SETOGE: + case ISD::SETUGE: + case ISD::SETGE: + // LHS must not be -0 + canTransform = (Zero = dyn_cast(LHS)) && + !Zero->isNegative(); + break; + case ISD::SETOLT: + case ISD::SETULT: + case ISD::SETLT: + // RHS must not be +0 + canTransform = (Zero = dyn_cast(RHS)) && + Zero->isNegative(); + break; + case ISD::SETOLE: + case ISD::SETULE: + case ISD::SETLE: + // LHS must not be +0 + canTransform = (Zero = dyn_cast(LHS)) && + Zero->isNegative(); + break; + } + } + if (canTransform) { + // Note: If one of the elements in a pair is a number and the other + // element is NaN, the corresponding result element is the number. + // This is consistent with the IEEE 754-2008 standard. + // Therefore, a > b ? a : b <=> vmax(a,b), if b is constant and a is NaN + switch (CC) { + default: + break; + case ISD::SETOGT: + case ISD::SETOGE: + if (!DAG.isKnownNeverNaN(RHS)) + break; + return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); + case ISD::SETUGT: + case ISD::SETUGE: + if (!DAG.isKnownNeverNaN(LHS)) + break; + case ISD::SETGT: + case ISD::SETGE: + return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); + case ISD::SETOLT: + case ISD::SETOLE: + if (!DAG.isKnownNeverNaN(RHS)) + break; + return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); + case ISD::SETULT: + case ISD::SETULE: + if (!DAG.isKnownNeverNaN(LHS)) + break; + case ISD::SETLT: + case ISD::SETLE: + return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); + } + } } bool swpCmpOps = false; @@ -3641,12 +3694,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { } } - SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); + SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { - SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32); + SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); @@ -3679,7 +3732,7 @@ static bool canChangeToInt(SDValue Op, bool &SeenZero, static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (isFloatingPointZero(Op)) - return DAG.getConstant(0, MVT::i32); + return DAG.getConstant(0, SDLoc(Op), MVT::i32); if (LoadSDNode *Ld = dyn_cast(Op)) return DAG.getLoad(MVT::i32, SDLoc(Op), @@ -3692,15 +3745,17 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2) { + SDLoc dl(Op); + if (isFloatingPointZero(Op)) { - RetVal1 = DAG.getConstant(0, MVT::i32); - RetVal2 = DAG.getConstant(0, MVT::i32); + RetVal1 = DAG.getConstant(0, dl, MVT::i32); + RetVal2 = DAG.getConstant(0, dl, MVT::i32); return; } if (LoadSDNode *Ld = dyn_cast(Op)) { SDValue Ptr = Ld->getBasePtr(); - RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op), + RetVal1 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), @@ -3708,9 +3763,9 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, EVT PtrType = Ptr.getValueType(); unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); - SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op), - PtrType, Ptr, DAG.getConstant(4, PtrType)); - RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op), + SDValue NewPtr = DAG.getNode(ISD::ADD, dl, + PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); + RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), @@ -3745,7 +3800,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { else if (CC == ISD::SETUNE) CC = ISD::SETNE; - SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32); + SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); SDValue ARMcc; if (LHS.getValueType() == MVT::f32) { LHS = DAG.getNode(ISD::AND, dl, MVT::i32, @@ -3765,7 +3820,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); ARMCC::CondCodes CondCode = IntCCToARMCC(CC); - ARMcc = DAG.getConstant(CondCode, MVT::i32); + ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); @@ -3789,7 +3844,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // If softenSetCCOperands only returned one value, we should compare it to // zero. if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); + RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } @@ -3815,14 +3870,14 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); - SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); + SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); if (CondCode2 != ARMCC::AL) { - ARMcc = DAG.getConstant(CondCode2, MVT::i32); + ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); } @@ -3835,13 +3890,11 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Index = Op.getOperand(2); SDLoc dl(Op); - EVT PTy = getPointerTy(); + EVT PTy = getPointerTy(DAG.getDataLayout()); JumpTableSDNode *JT = cast(Table); - ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo(); - SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); - Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId); - Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy)); + Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); + Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); if (Subtarget->isThumb2()) { // Thumb2 uses a two-level jump. That is, it jumps into the jump table @@ -3849,7 +3902,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { // to translate it to TBB / TBH later. // FIXME: This might not work if the function is extremely large. return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, - Addr, Op.getOperand(2), JTI, UId); + Addr, Op.getOperand(2), JTI); } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, @@ -3857,13 +3910,13 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { false, false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); - return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { Addr = DAG.getLoad(PTy, dl, Chain, Addr, MachinePointerInfo::getJumpTable(), false, false, false, 0); Chain = Addr.getValue(1); - return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); + return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } } @@ -3890,7 +3943,6 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) @@ -3903,20 +3955,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { /*isSigned*/ false, SDLoc(Op)).first; } - SDLoc dl(Op); - unsigned Opc; - - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid opcode!"); - case ISD::FP_TO_SINT: - Opc = ARMISD::FTOSI; - break; - case ISD::FP_TO_UINT: - Opc = ARMISD::FTOUI; - break; - } - Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0)); - return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); + return Op; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -3956,7 +3995,6 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) @@ -3969,21 +4007,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { /*isSigned*/ false, SDLoc(Op)).first; } - SDLoc dl(Op); - unsigned Opc; - - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid opcode!"); - case ISD::SINT_TO_FP: - Opc = ARMISD::SITOF; - break; - case ISD::UINT_TO_FP: - Opc = ARMISD::UITOF; - break; - } - - Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0)); - return DAG.getNode(Opc, dl, VT, Op); + return Op; } SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { @@ -4001,12 +4025,12 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Use VBSL to copy the sign bit. unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, - DAG.getTargetConstant(EncodedVal, MVT::i32)); + DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; if (VT == MVT::f64) Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), - DAG.getConstant(32, MVT::i32)); + DAG.getConstant(32, dl, MVT::i32)); else /*if (VT == MVT::f32)*/ Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); if (SrcVT == MVT::f32) { @@ -4014,16 +4038,16 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), - DAG.getConstant(32, MVT::i32)); + DAG.getConstant(32, dl, MVT::i32)); } else if (VT == MVT::f32) Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), - DAG.getConstant(32, MVT::i32)); + DAG.getConstant(32, dl, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), - MVT::i32); + dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); @@ -4034,7 +4058,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::f32) { Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); } else { Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); } @@ -4049,8 +4073,8 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); // Or in the signbit with integer operations. - SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32); - SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32); + SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); + SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); if (VT == MVT::f32) { Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, @@ -4081,7 +4105,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(4, MVT::i32); + SDValue Offset = DAG.getConstant(4, dl, MVT::i32); return DAG.getLoad(VT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); @@ -4113,14 +4137,35 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned ARMTargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { +unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) .Case("sp", ARM::SP) .Default(0); if (Reg) return Reg; - report_fatal_error("Invalid register name global variable"); + report_fatal_error(Twine("Invalid register name \"" + + StringRef(RegName) + "\".")); +} + +// Result is 64 bit value so split into two 32 bit values and return as a +// pair of values. +static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) { + SDLoc DL(N); + + // This function is only supposed to be called for i64 type destination. + assert(N->getValueType(0) == MVT::i64 + && "ExpandREAD_REGISTER called for non-i64 type result."); + + SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, + DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), + N->getOperand(0), + N->getOperand(1)); + + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), + Read.getValue(1))); + Results.push_back(Read.getOperand(0)); } /// ExpandBITCAST - If the target supports VFP, this function is called to @@ -4143,9 +4188,9 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, DstVT, DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } @@ -4153,7 +4198,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { SDValue Cvt; - if (TLI.isBigEndian() && SrcVT.isVector() && + if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && SrcVT.getVectorNumElements() > 1) Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), @@ -4177,7 +4222,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // The canonical modified immediate encoding of a zero vector is....0! - SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32); + SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -4200,17 +4245,17 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, - DAG.getConstant(VTBits, MVT::i32), ShAmt); + DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, - DAG.getConstant(VTBits, MVT::i32)); + DAG.getConstant(VTBits, dl, MVT::i32)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, - ARMcc, DAG, dl); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), + ISD::SETGE, ARMcc, DAG, dl); SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp); @@ -4234,17 +4279,17 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, - DAG.getConstant(VTBits, MVT::i32), ShAmt); + DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, - DAG.getConstant(VTBits, MVT::i32)); + DAG.getConstant(VTBits, dl, MVT::i32)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, - ARMcc, DAG, dl); + SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), + ISD::SETGE, ARMcc, DAG, dl); SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, CCR, Cmp); @@ -4261,20 +4306,94 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, - DAG.getConstant(Intrinsic::arm_get_fpscr, + DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, - DAG.getConstant(1U << 22, MVT::i32)); + DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, - DAG.getConstant(22, MVT::i32)); + DAG.getConstant(22, dl, MVT::i32)); return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, MVT::i32)); + DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); SDLoc dl(N); + EVT VT = N->getValueType(0); + if (VT.isVector()) { + assert(ST->hasNEON()); + + // Compute the least significant set bit: LSB = X & -X + SDValue X = N->getOperand(0); + SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); + + EVT ElemTy = VT.getVectorElementType(); + + if (ElemTy == MVT::i8) { + // Compute with: cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); + } + + if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && + (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { + // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 + unsigned NumBits = ElemTy.getSizeInBits(); + SDValue WidthMinus1 = + DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); + SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); + } + + // Compute with: cttz(x) = ctpop(lsb - 1) + + // Since we can only compute the number of bits in a byte with vcnt.8, we + // have to gather the result with pairwise addition (vpaddl) for i16, i32, + // and i64. + + // Compute LSB - 1. + SDValue Bits; + if (ElemTy == MVT::i64) { + // Load constant 0xffff'ffff'ffff'ffff to register. + SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(0x1eff, dl, MVT::i32)); + Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); + } else { + SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, + DAG.getTargetConstant(1, dl, ElemTy)); + Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); + } + + // Count #bits with vcnt.8. + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); + SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); + + // Gather the #bits with vpaddl (pairwise add.) + EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt8); + if (ElemTy == MVT::i16) + return Cnt16; + + EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32; + SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt16); + if (ElemTy == MVT::i32) + return Cnt32; + + assert(ElemTy == MVT::i64); + SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), + Cnt32); + return Cnt64; + } if (!ST->hasV6T2Ops()) return SDValue(); @@ -4326,10 +4445,10 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { if (VT.is64BitVector()) { SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); } else { SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, - BitCounts, DAG.getIntPtrConstant(0)); + BitCounts, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); } } @@ -4368,10 +4487,10 @@ static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { if (VT.is64BitVector()) { SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); } else { SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); } } @@ -4405,7 +4524,8 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, // Left shifts translate directly to the vshiftu intrinsic. if (N->getOpcode() == ISD::SHL) return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), + DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, + MVT::i32), N->getOperand(0), N->getOperand(1)); assert((N->getOpcode() == ISD::SRA || @@ -4422,7 +4542,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, Intrinsic::arm_neon_vshifts : Intrinsic::arm_neon_vshiftu); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(vshiftInt, MVT::i32), + DAG.getConstant(vshiftInt, dl, MVT::i32), N->getOperand(0), NegatedCount); } @@ -4448,9 +4568,9 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, dl, MVT::i32)); // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. @@ -4473,6 +4593,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); + EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); SDLoc dl(Op); @@ -4502,8 +4623,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1); + Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); break; case ISD::SETUO: Invert = true; // Fallthrough case ISD::SETO: @@ -4511,8 +4632,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; - Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0); - Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1); + Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); + Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); break; } } else { @@ -4546,8 +4667,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { Opc = ARMISD::VTST; - Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0)); - Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1)); + Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); Invert = !Invert; } } @@ -4573,22 +4694,24 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (SingleOp.getNode()) { switch (Opc) { case ARMISD::VCEQ: - Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGE: - Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLEZ: - Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGT: - Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLTZ: - Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break; + Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; default: - Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } } else { - Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } + Result = DAG.getSExtOrTrunc(Result, dl, VT); + if (Invert) Result = DAG.getNOT(dl, Result, VT); @@ -4600,7 +4723,8 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { /// operand (e.g., VMOV). If so, return the encoded value. static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, - EVT &VT, bool is128Bits, NEONModImmType type) { + SDLoc dl, EVT &VT, bool is128Bits, + NEONModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a @@ -4715,7 +4839,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getTargetLoweringInfo().isBigEndian()) + if (DAG.getDataLayout().isBigEndian()) // swap higher and lower 32 bit word Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); @@ -4730,7 +4854,7 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, } unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); - return DAG.getTargetConstant(EncodedVal, MVT::i32); + return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); } SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, @@ -4760,11 +4884,11 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // It's a float and we are trying to use NEON operations where // possible. Lower it to a splat followed by an extract. SDLoc DL(Op); - SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32); + SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, DL, MVT::i32)); } // The rest of our options are NEON only, make sure that's allowed before @@ -4782,8 +4906,8 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, return SDValue(); // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). - SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT, - false, VMOVModImm); + SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), + VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, @@ -4795,11 +4919,11 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, DL, MVT::i32)); } // Finally, try a VMVN.i32 - NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT, + NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); @@ -4812,7 +4936,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, DL, MVT::i32)); } return SDValue(); @@ -4921,18 +5045,50 @@ static bool isVTBLMask(ArrayRef M, EVT VT) { return VT == MVT::v8i8 && M.size() == 8; } +// Checks whether the shuffle mask represents a vector transpose (VTRN) by +// checking that pairs of elements in the shuffle mask represent the same index +// in each vector, incrementing the expected index by 2 at each step. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} +// v2={e,f,g,h} +// WhichResult gives the offset for each element in the mask based on which +// of the two results it belongs to. +// +// The transpose can be represented either as: +// result1 = shufflevector v1, v2, result1_shuffle_mask +// result2 = shufflevector v1, v2, result2_shuffle_mask +// where v1/v2 and the shuffle masks have the same number of elements +// (here WhichResult (see below) indicates which result is being checked) +// +// or as: +// results = shufflevector v1, v2, shuffle_mask +// where both results are returned in one vector and the shuffle mask has twice +// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we +// want to check the low half and high half of the shuffle mask as if it were +// the other case static bool isVTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + // If the mask is twice as long as the result then we need to check the upper + // and lower parts of the mask + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } @@ -4945,28 +5101,52 @@ static bool isVTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } +// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking +// that the mask elements are either all even and in steps of size 2 or all odd +// and in steps of size 2. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with +// respect the how results are returned. static bool isVUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) continue; // ignore UNDEF indices - if ((unsigned) M[i] != 2 * i + WhichResult) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; ++j) { + if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) + return false; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -4982,18 +5162,27 @@ static bool isVUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ if (EltSz == 64) return false; - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned) MIdx != Idx) - return false; - Idx += 2; + unsigned NumElts = VT.getVectorNumElements(); + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + unsigned Half = NumElts / 2; + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += Half) { + unsigned Idx = WhichResult; + for (unsigned k = 0; k < Half; ++k) { + int MIdx = M[i + j + k]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) + return false; + Idx += 2; + } } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5001,21 +5190,37 @@ static bool isVUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ return true; } +// Checks whether the shuffle mask represents a vector zip (VZIP) by checking +// that pairs of elements of the shufflemask represent the same index in each +// vector incrementing sequentially through the vectors. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with respect the how results +// are returned. static bool isVZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5032,15 +5237,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5048,6 +5261,30 @@ static bool isVZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ return true; } +/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), +/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. +static unsigned isNEONTwoResultShuffleMask(ArrayRef ShuffleMask, EVT VT, + unsigned &WhichResult, + bool &isV_UNDEF) { + isV_UNDEF = false; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return ARMISD::VTRN; + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return ARMISD::VUZP; + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return ARMISD::VZIP; + + isV_UNDEF = true; + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return ARMISD::VTRN; + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return ARMISD::VUZP; + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return ARMISD::VZIP; + + return 0; +} + /// \return true if this is a reverse operation on an vector. static bool isReverseMask(ArrayRef M, EVT VT) { unsigned NumElts = VT.getVectorNumElements(); @@ -5075,10 +5312,10 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, if (ST->isThumb1Only()) { if (Val <= 255 || ~Val <= 255) - return DAG.getConstant(Val, MVT::i32); + return DAG.getConstant(Val, dl, MVT::i32); } else { if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) - return DAG.getConstant(Val, MVT::i32); + return DAG.getConstant(Val, dl, MVT::i32); } return SDValue(); } @@ -5100,7 +5337,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, EVT VmovVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, VmovVT, VT.is128BitVector(), + DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); @@ -5111,7 +5348,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, uint64_t NegatedImm = (~SplatBits).getZExtValue(); Val = isNEONModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, - DAG, VmovVT, VT.is128BitVector(), + DAG, dl, VmovVT, VT.is128BitVector(), VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); @@ -5122,7 +5359,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { int ImmVal = ARM_AM::getFP32Imm(SplatBits); if (ImmVal != -1) { - SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); + SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); } } @@ -5204,8 +5441,8 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, VT.getVectorNumElements(); N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), - Value, DAG.getConstant(index, MVT::i32)), - DAG.getConstant(index, MVT::i32)); + Value, DAG.getConstant(index, dl, MVT::i32)), + DAG.getConstant(index, dl, MVT::i32)); } else N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, Value->getOperand(0), Value->getOperand(1)); @@ -5221,7 +5458,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SmallVector Ops; Ops.push_back(N); Ops.push_back(Op.getOperand(I)); - Ops.push_back(DAG.getConstant(I, MVT::i32)); + Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); } } @@ -5285,7 +5522,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; - SDValue LaneIdx = DAG.getConstant(i, MVT::i32); + SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; @@ -5388,24 +5625,25 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, VEXTOffsets[i] = NumElts; ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); + DAG.getIntPtrConstant(NumElts, dl)); } else if (MaxElts[i] < NumElts) { // The extraction can just take the first half VEXTOffsets[i] = 0; ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); } else { // An actual VEXT is needed VEXTOffsets[i] = MinElts[i]; SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); + DAG.getIntPtrConstant(NumElts, dl)); ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(VEXTOffsets[i], MVT::i32)); + DAG.getConstant(VEXTOffsets[i], dl, + MVT::i32)); } } @@ -5463,7 +5701,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, return true; } - bool ReverseVEXT; + bool ReverseVEXT, isV_UNDEF; unsigned Imm, WhichResult; unsigned EltSize = VT.getVectorElementType().getSizeInBits(); @@ -5474,12 +5712,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, isVREVMask(M, VT, 16) || isVEXTMask(M, VT, ReverseVEXT, Imm) || isVTBLMask(M, VT) || - isVTRNMask(M, VT, WhichResult) || - isVUZPMask(M, VT, WhichResult) || - isVZIPMask(M, VT, WhichResult) || - isVTRN_v_undef_Mask(M, VT, WhichResult) || - isVUZP_v_undef_Mask(M, VT, WhichResult) || - isVZIP_v_undef_Mask(M, VT, WhichResult) || + isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); } @@ -5539,13 +5772,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, case OP_VDUP2: case OP_VDUP3: return DAG.getNode(ARMISD::VDUPLANE, dl, VT, - OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32)); + OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: return DAG.getNode(ARMISD::VEXT, dl, VT, OpLHS, OpRHS, - DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32)); + DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); case OP_VUZPL: case OP_VUZPR: return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), @@ -5572,7 +5805,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, SmallVector VTBLMask; for (ArrayRef::iterator I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) - VTBLMask.push_back(DAG.getConstant(*I, MVT::i32)); + VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); if (V2.getNode()->getOpcode() == ISD::UNDEF) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, @@ -5596,7 +5829,7 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, // into the bottom double word. The v8i16 case is similar. unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, - DAG.getConstant(ExtractNum, MVT::i32)); + DAG.getConstant(ExtractNum, DL, MVT::i32)); } static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { @@ -5640,7 +5873,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); } return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i32)); + DAG.getConstant(Lane, dl, MVT::i32)); } bool ReverseVEXT; @@ -5649,7 +5882,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (ReverseVEXT) std::swap(V1, V2); return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, - DAG.getConstant(Imm, MVT::i32)); + DAG.getConstant(Imm, dl, MVT::i32)); } if (isVREVMask(ShuffleMask, VT, 64)) @@ -5662,7 +5895,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (V2->getOpcode() == ISD::UNDEF && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, - DAG.getConstant(Imm, MVT::i32)); + DAG.getConstant(Imm, dl, MVT::i32)); } // Check for Neon shuffles that modify both input vectors in place. @@ -5671,25 +5904,53 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // these operations, DAG memoization will ensure that a single node is // used for both shuffles. unsigned WhichResult; - if (isVTRNMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVUZPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVZIPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - - if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); + bool isV_UNDEF; + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, VT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + V2 = V1; + return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) + .getValue(WhichResult); + } + + // Also check for these shuffles through CONCAT_VECTORS: we canonicalize + // shuffles that produce a result larger than their operands with: + // shuffle(concat(v1, undef), concat(v2, undef)) + // -> + // shuffle(concat(v1, v2), undef) + // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). + // + // This is useful in the general case, but there are special cases where + // native shuffles produce larger results: the two-result ops. + // + // Look through the concat when lowering them: + // shuffle(concat(v1, v2), undef) + // -> + // concat(VZIP(v1, v2):0, :1) + // + if (V1->getOpcode() == ISD::CONCAT_VECTORS && + V2->getOpcode() == ISD::UNDEF) { + SDValue SubV1 = V1->getOperand(0); + SDValue SubV2 = V1->getOperand(1); + EVT SubVT = SubV1.getValueType(); + + // We expect these to have been canonicalized to -1. + assert(std::all_of(ShuffleMask.begin(), ShuffleMask.end(), [&](int i) { + return i < (int)VT.getVectorNumElements(); + }) && "Unexpected shuffle index into UNDEF operand!"); + + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + SubV2 = SubV1; + assert((WhichResult == 0) && + "In-place shuffle of concat can only have one result!"); + SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), + SubV1, SubV2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), + Res.getValue(1)); + } + } } // If the shuffle is not directly supported and it has 4 elements, use @@ -5730,7 +5991,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ShuffleMask[i] < (int)NumElts ? V1 : V2, DAG.getConstant(ShuffleMask[i] & (NumElts-1), - MVT::i32))); + dl, MVT::i32))); } SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -5785,11 +6046,11 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { if (Op0.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); if (Op1.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), - DAG.getIntPtrConstant(1)); + DAG.getIntPtrConstant(1, dl)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } @@ -5805,7 +6066,7 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, if (BVN->getValueType(0) != MVT::v4i32 || BVN->getOpcode() != ISD::BUILD_VECTOR) return false; - unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; unsigned HiElt = 1 - LoElt; ConstantSDNode *Lo0 = dyn_cast(BVN->getOperand(LoElt)); ConstantSDNode *Hi0 = dyn_cast(BVN->getOperand(HiElt)); @@ -5950,7 +6211,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { SDNode *BVN = N->getOperand(0).getNode(); assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); - unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32, BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); } @@ -5961,14 +6222,15 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; + SDLoc dl(N); for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. - Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); + Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), + return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::getVectorVT(TruncVT, NumElts), Ops); } @@ -6081,14 +6343,15 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { // Get reciprocal estimate. // float4 recip = vrecpeq_f32(yf); Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, - DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y); + DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), + Y); // Because char has a smaller range than uchar, we can actually get away // without any newton steps. This requires that we use a weird bias // of 0xb000, however (again, this has been exhaustively tested). // float4 result = as_float4(as_int4(xf*recip) + 0xb000); X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); - Y = DAG.getConstant(0xb000, MVT::i32); + Y = DAG.getConstant(0xb000, dl, MVT::i32); Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y); X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); @@ -6113,9 +6376,10 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { // float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, - DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1); + DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), + N1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, - DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), N1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Because short has a smaller range than ushort, we can actually get away @@ -6124,7 +6388,7 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { // float4 result = as_float4(as_int4(xf*recip) + 0x89); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); - N1 = DAG.getConstant(0x89, MVT::i32); + N1 = DAG.getConstant(0x89, dl, MVT::i32); N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); @@ -6150,13 +6414,13 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 @@ -6185,13 +6449,13 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 @@ -6200,7 +6464,8 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { N0 = LowerCONCAT_VECTORS(N0, DAG); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, - DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32), + DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, + MVT::i32), N0); return N0; } @@ -6218,13 +6483,14 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { // recip *= vrecpsq_f32(yf, recip); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, - DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1); + DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), + BN1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, - DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, - DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32), + DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Simply multiplying by the reciprocal estimate can leave us a few ulps @@ -6233,7 +6499,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { // float4 result = as_float4(as_int4(xf*recip) + 2); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); - N1 = DAG.getConstant(2, MVT::i32); + N1 = DAG.getConstant(2, dl, MVT::i32); N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); @@ -6274,18 +6540,19 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); // Create stack object for sret. - const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy); - const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy); + auto &DL = DAG.getDataLayout(); + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); - SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy()); + SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); ArgListTy Args; ArgListEntry Entry; @@ -6305,7 +6572,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) @@ -6319,8 +6586,8 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(), false, false, false, 0); // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize())); + SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, + DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo(), false, false, false, 0); @@ -6350,12 +6617,12 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, // Under Power Management extensions, the cycle-count is: // mrc p15, #0, , c9, c13, #0 SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, MVT::i32), - DAG.getConstant(15, MVT::i32), - DAG.getConstant(0, MVT::i32), - DAG.getConstant(9, MVT::i32), - DAG.getConstant(13, MVT::i32), - DAG.getConstant(0, MVT::i32) + DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getConstant(15, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(9, DL, MVT::i32), + DAG.getConstant(13, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32) }; Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, @@ -6365,13 +6632,13 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, // Intrinsic is defined to return 0 on unsupported platforms. Technically // there are older ARM CPUs that have implementation-specific ways of // obtaining this information (FIXME!). - Cycles32 = DAG.getConstant(0, MVT::i32); + Cycles32 = DAG.getConstant(0, DL, MVT::i32); OutChain = DAG.getEntryNode(); } SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, - Cycles32, DAG.getConstant(0, MVT::i32)); + Cycles32, DAG.getConstant(0, DL, MVT::i32)); Results.push_back(Cycles64); Results.push_back(OutChain); } @@ -6379,6 +6646,7 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); + case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: @@ -6409,6 +6677,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); @@ -6418,7 +6687,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); @@ -6463,6 +6733,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); + case ISD::READ_REGISTER: + ExpandREAD_REGISTER(N, Results, DAG); + break; case ISD::BITCAST: Res = ExpandBITCAST(N, DAG); break; @@ -6487,8 +6760,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, void ARMTargetLowering:: SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); @@ -6505,9 +6777,8 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); - const TargetRegisterClass *TRC = isThumb ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass; + const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass + : &ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = @@ -6571,9 +6842,9 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3, RegState::Kill)); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); - AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5) - .addFrameIndex(FI) - .addImm(36)); // &jbuf[1] :: pc + BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) + .addFrameIndex(FI) + .addImm(36); // &jbuf[1] :: pc AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg5, RegState::Kill) @@ -6601,20 +6872,17 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, } } -MachineBasicBlock *ARMTargetLowering:: -EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); +void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, + MachineBasicBlock *MBB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - ARMFunctionInfo *AFI = MF->getInfo(); MachineFrameInfo *MFI = MF->getFrameInfo(); int FI = MFI->getFunctionContextIndex(); - const TargetRegisterClass *TRC = Subtarget->isThumb() ? - (const TargetRegisterClass*)&ARM::tGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRnopcRegClass; + const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass + : &ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. @@ -6665,7 +6933,6 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); unsigned MJTI = JTI->createJumpTableIndex(LPadList); - unsigned UId = AFI->createJumpTableUId(); Reloc::Model RelocM = getTargetMachine().getRelocationModel(); // Create the MBBs for the dispatch code. @@ -6748,8 +7015,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { unsigned NewVReg3 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3) - .addJumpTableIndex(MJTI) - .addImm(UId)); + .addJumpTableIndex(MJTI)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); AddDefaultCC( @@ -6762,8 +7028,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg1) - .addJumpTableIndex(MJTI) - .addImm(UId); + .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) @@ -6781,9 +7046,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6808,8 +7073,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { unsigned NewVReg3 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) - .addJumpTableIndex(MJTI) - .addImm(UId)); + .addJumpTableIndex(MJTI)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) @@ -6838,8 +7102,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) .addReg(NewVReg6, RegState::Kill) - .addJumpTableIndex(MJTI) - .addImm(UId); + .addJumpTableIndex(MJTI); } else { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) @@ -6873,9 +7136,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); @@ -6900,8 +7163,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)))); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) - .addJumpTableIndex(MJTI) - .addImm(UId)); + .addJumpTableIndex(MJTI)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), @@ -6918,13 +7180,11 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) .addReg(NewVReg5, RegState::Kill) .addReg(NewVReg4) - .addJumpTableIndex(MJTI) - .addImm(UId); + .addJumpTableIndex(MJTI); } else { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) .addReg(NewVReg5, RegState::Kill) - .addJumpTableIndex(MJTI) - .addImm(UId); + .addJumpTableIndex(MJTI); } } @@ -6933,7 +7193,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { for (std::vector::iterator I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { MachineBasicBlock *CurMBB = *I; - if (SeenMBBs.insert(CurMBB)) + if (SeenMBBs.insert(CurMBB).second) DispContBB->addSuccessor(CurMBB); } @@ -7000,8 +7260,6 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { // The instruction is gone now. MI->eraseFromParent(); - - return MBB; } static @@ -7119,8 +7377,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). // Otherwise, we will generate unrolled scalar copies. - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = BB; ++It; @@ -7146,9 +7403,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, UnitSize = 2; } else { // Check whether we can use NEON instructions. - if (!MF->getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && + if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; @@ -7162,14 +7417,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Select the correct opcode and register class for unit size load/store bool IsNeon = UnitSize >= 8; - TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass - : (const TargetRegisterClass *)&ARM::GPRRegClass; + TRC = (IsThumb1 || IsThumb2) ? &ARM::tGPRRegClass : &ARM::GPRRegClass; if (IsNeon) - VecTRC = UnitSize == 16 - ? (const TargetRegisterClass *)&ARM::DPairRegClass - : UnitSize == 8 - ? (const TargetRegisterClass *)&ARM::DPRRegClass - : nullptr; + VecTRC = UnitSize == 16 ? &ARM::DPairRegClass + : UnitSize == 8 ? &ARM::DPRRegClass + : nullptr; unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; @@ -7242,25 +7494,29 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Load an immediate to varEnd. unsigned varEnd = MRI.createVirtualRegister(TRC); - if (IsThumb2) { + if (Subtarget->useMovt(*MF)) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp) - .addImm(LoopSize & 0xFFFF)); + AddDefaultPred(BuildMI(BB, dl, + TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16), + Vtmp).addImm(LoopSize & 0xFFFF)); if ((LoopSize & 0xFFFF0000) != 0) - AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd) - .addReg(Vtmp).addImm(LoopSize >> 16)); + AddDefaultPred(BuildMI(BB, dl, + TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16), + varEnd) + .addReg(Vtmp) + .addImm(LoopSize >> 16)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. - unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty); + unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) - Align = getDataLayout()->getTypeAllocSize(C->getType()); + Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); if (IsThumb1) @@ -7354,7 +7610,7 @@ MachineBasicBlock * ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock *MBB) const { const TargetMachine &TM = getTargetMachine(); - const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetWindows() && @@ -7419,8 +7675,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = - getTargetMachine().getSubtargetImpl()->getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI->getOpcode()) { @@ -7585,6 +7840,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: + return BB; + + case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; @@ -7613,13 +7871,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, unsigned int ABSSrcReg = MI->getOperand(1).getReg(); unsigned int ABSDstReg = MI->getOperand(0).getReg(); + bool ABSSrcKIll = MI->getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class - unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? - (const TargetRegisterClass*)&ARM::rGPRRegClass : - (const TargetRegisterClass*)&ARM::GPRRegClass); + unsigned NewRsbDstReg = + MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, @@ -7647,7 +7905,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) - .addReg(ABSSrcReg, RegState::Kill) + .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); // insert PHI in SinkBB, @@ -7684,8 +7942,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // Rename pseudo opcodes. unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode()); if (NewOpc) { - const ARMBaseInstrInfo *TII = static_cast( - getTargetMachine().getSubtargetImpl()->getInstrInfo()); + const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); MCID = &TII->get(NewOpc); assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 && @@ -7789,6 +8046,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, return false; // Fall through. case ISD::SIGN_EXTEND: { + SDLoc dl(N); EVT VT = N->getValueType(0); CC = N->getOperand(0); if (CC.getValueType() != MVT::i1) @@ -7797,12 +8055,13 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, if (AllOnes) // When looking for an AllOnes constant, N is an sext, and the 'other' // value is 0. - OtherOp = DAG.getConstant(0, VT); + OtherOp = DAG.getConstant(0, dl, VT); else if (N->getOpcode() == ISD::ZERO_EXTEND) // When looking for a 0 constant, N can be zext or sext. - OtherOp = DAG.getConstant(1, VT); + OtherOp = DAG.getConstant(1, dl, VT); else - OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT); + OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, + VT); return true; } } @@ -7941,10 +8200,12 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + // Build operand list. SmallVector Ops; - Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, - TLI.getPointerTy())); + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, + TLI.getPointerTy(DAG.getDataLayout()))); // Input is the vector. Ops.push_back(Vec); @@ -7962,9 +8223,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, llvm_unreachable("Invalid vector element type for padd optimization."); } - SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), widenType, Ops); + SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; - return DAG.getNode(ExtOp, SDLoc(N), VT, tmp); + return DAG.getNode(ExtOp, dl, VT, tmp); } static SDValue findMUL_LOHI(SDValue V) { @@ -7989,13 +8250,13 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, // a glue link from the first add to the second add. // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by // a S/UMLAL instruction. - // loAdd UMUL_LOHI - // \ / :lo \ :hi - // \ / \ [no multiline comment] - // ADDC | hiAdd - // \ :glue / / - // \ / / - // ADDE + // UMUL_LOHI + // / :lo \ :hi + // / \ [no multiline comment] + // loAdd -> ADDE | + // \ :glue / + // \ / + // ADDC <- hiAdd // assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC"); SDValue AddcOp0 = AddcNode->getOperand(0); @@ -8049,29 +8310,35 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, else IsLeftOperandMUL = true; if (MULOp == SDValue()) - return SDValue(); + return SDValue(); // Figure out the right opcode. unsigned Opc = MULOp->getOpcode(); unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; // Figure out the high and low input values to the MLAL node. - SDValue* HiMul = &MULOp; SDValue* HiAdd = nullptr; SDValue* LoMul = nullptr; SDValue* LowAdd = nullptr; + // Ensure that ADDE is from high result of ISD::SMUL_LOHI. + if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1))) + return SDValue(); + if (IsLeftOperandMUL) HiAdd = &AddeOp1; else HiAdd = &AddeOp0; - if (AddcOp0->getOpcode() == Opc) { + // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node + // whose low result is fed to the ADDC we are checking. + + if (AddcOp0 == MULOp.getValue(0)) { LoMul = &AddcOp0; LowAdd = &AddcOp1; } - if (AddcOp1->getOpcode() == Opc) { + if (AddcOp1 == MULOp.getValue(0)) { LoMul = &AddcOp1; LowAdd = &AddcOp0; } @@ -8079,9 +8346,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, if (!LoMul) return SDValue(); - if (LoMul->getNode() != HiMul->getNode()) - return SDValue(); - // Create the merged node. SelectionDAG &DAG = DCI.DAG; @@ -8255,14 +8519,14 @@ static SDValue PerformMULCombine(SDNode *N, V, DAG.getNode(ISD::SHL, DL, VT, V, - DAG.getConstant(Log2_32(MulAmt - 1), + DAG.getConstant(Log2_32(MulAmt - 1), DL, MVT::i32))); } else if (isPowerOf2_32(MulAmt + 1)) { // (mul x, 2^N - 1) => (sub (shl x, N), x) Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, V, - DAG.getConstant(Log2_32(MulAmt + 1), + DAG.getConstant(Log2_32(MulAmt + 1), DL, MVT::i32)), V); } else @@ -8275,7 +8539,7 @@ static SDValue PerformMULCombine(SDNode *N, V, DAG.getNode(ISD::SHL, DL, VT, V, - DAG.getConstant(Log2_32(MulAmtAbs + 1), + DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, MVT::i32))); } else if (isPowerOf2_32(MulAmtAbs - 1)) { // (mul x, -(2^N + 1)) => - (add (shl x, N), x) @@ -8283,10 +8547,10 @@ static SDValue PerformMULCombine(SDNode *N, V, DAG.getNode(ISD::SHL, DL, VT, V, - DAG.getConstant(Log2_32(MulAmtAbs-1), + DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, MVT::i32))); Res = DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, MVT::i32),Res); + DAG.getConstant(0, DL, MVT::i32), Res); } else return SDValue(); @@ -8294,7 +8558,7 @@ static SDValue PerformMULCombine(SDNode *N, if (ShiftAmt != 0) Res = DAG.getNode(ISD::SHL, DL, VT, - Res, DAG.getConstant(ShiftAmt, MVT::i32)); + Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); @@ -8323,7 +8587,7 @@ static SDValue PerformANDCombine(SDNode *N, EVT VbicVT; SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, VbicVT, VT.is128BitVector(), + DAG, dl, VbicVT, VT.is128BitVector(), OtherModImm); if (Val.getNode()) { SDValue Input = @@ -8366,7 +8630,7 @@ static SDValue PerformORCombine(SDNode *N, EVT VorrVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, VorrVT, VT.is128BitVector(), + DAG, dl, VorrVT, VT.is128BitVector(), OtherModImm); if (Val.getNode()) { SDValue Input = @@ -8470,8 +8734,8 @@ static SDValue PerformORCombine(SDNode *N, Val >>= countTrailingZeros(~Mask); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, - DAG.getConstant(Val, MVT::i32), - DAG.getConstant(Mask, MVT::i32)); + DAG.getConstant(Val, DL, MVT::i32), + DAG.getConstant(Mask, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); @@ -8496,9 +8760,9 @@ static SDValue PerformORCombine(SDNode *N, // 2a unsigned amt = countTrailingZeros(Mask2); Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), - DAG.getConstant(amt, MVT::i32)); + DAG.getConstant(amt, DL, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, - DAG.getConstant(Mask, MVT::i32)); + DAG.getConstant(Mask, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); return SDValue(); @@ -8512,9 +8776,9 @@ static SDValue PerformORCombine(SDNode *N, // 2b unsigned lsb = countTrailingZeros(Mask); Res = DAG.getNode(ISD::SRL, DL, VT, N00, - DAG.getConstant(lsb, MVT::i32)); + DAG.getConstant(lsb, DL, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, - DAG.getConstant(Mask2, MVT::i32)); + DAG.getConstant(Mask2, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); return SDValue(); @@ -8533,7 +8797,7 @@ static SDValue PerformORCombine(SDNode *N, return SDValue(); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), - DAG.getConstant(~Mask, MVT::i32)); + DAG.getConstant(~Mask, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); @@ -8573,7 +8837,10 @@ static SDValue PerformBFICombine(SDNode *N, unsigned InvMask = cast(N->getOperand(2))->getZExtValue(); unsigned LSB = countTrailingZeros(~InvMask); unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; - unsigned Mask = (1 << Width)-1; + assert(Width < + static_cast(std::numeric_limits::digits) && + "undefined behavior"); + unsigned Mask = (1u << Width) - 1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), @@ -8611,14 +8878,14 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, LD->getAlignment()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, - DAG.getConstant(4, MVT::i32)); + DAG.getConstant(4, DL, MVT::i32)); SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), std::min(4U, LD->getAlignment() / 2)); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); - if (DCI.DAG.getTargetLoweringInfo().isBigEndian()) + if (DCI.DAG.getDataLayout().isBigEndian()) std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); return Result; @@ -8645,160 +8912,19 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// PerformSTORECombine - Target-specific dag combine xforms for -/// ISD::STORE. -static SDValue PerformSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - StoreSDNode *St = cast(N); - if (St->isVolatile()) - return SDValue(); - - // Optimize trunc store (of multiple scalars) to shuffle and store. First, - // pack all of the elements in one place. Next, store to memory in fewer - // chunks. - SDValue StVal = St->getValue(); - EVT VT = StVal.getValueType(); - if (St->isTruncatingStore() && VT.isVector()) { - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT StVT = St->getMemoryVT(); - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); - unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); - - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); - - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); - - unsigned SizeRatio = FromEltSz / ToEltSz; - assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), - NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDLoc DL(St); - SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); - SmallVector ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; ++i) - ShuffleVec[i] = TLI.isBigEndian() ? (i+1) * SizeRatio - 1 : i * SizeRatio; - - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec.data()); - // At this point all of the data is stored at the bottom of the - // register. We now need to save it to mem. - - // Find the largest store unit - MVT StoreType = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) - StoreType = Tp; - } - // Didn't find a legal store type. - if (!TLI.isTypeLegal(StoreType)) - return SDValue(); - - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); - SmallVector Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, - TLI.getPointerTy()); - SDValue BasePtr = St->getBasePtr(); - - // Perform one or more big stores into memory. - unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); - for (unsigned I = 0; I < E; I++) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, - StoreType, ShuffWide, - DAG.getIntPtrConstant(I)); - SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); - BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, - Increment); - Chains.push_back(Ch); - } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (!ISD::isNormalStore(St)) - return SDValue(); - - // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and - // ARM stores of arguments in the same cache line. - if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && - StVal.getNode()->hasOneUse()) { - SelectionDAG &DAG = DCI.DAG; - bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian(); - SDLoc DL(St); - SDValue BasePtr = St->getBasePtr(); - SDValue NewST1 = DAG.getStore(St->getChain(), DL, - StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), - BasePtr, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); - - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, - DAG.getConstant(4, MVT::i32)); - return DAG.getStore(NewST1.getValue(0), DL, - StVal.getNode()->getOperand(isBigEndian ? 0 : 1), - OffsetPtr, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), - std::min(4U, St->getAlignment() / 2)); - } - - if (StVal.getValueType() != MVT::i64 || - StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - // Bitcast an i64 store extracted from a vector to f64. - // Otherwise, the i64 value will be legalized to a pair of i32 values. - SelectionDAG &DAG = DCI.DAG; - SDLoc dl(StVal); - SDValue IntVec = StVal.getOperand(0); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, - IntVec.getValueType().getVectorNumElements()); - SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); - SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - Vec, StVal.getOperand(1)); - dl = SDLoc(N); - SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); - // Make the DAGCombiner fold the bitcasts. - DCI.AddToWorklist(Vec.getNode()); - DCI.AddToWorklist(ExtElt.getNode()); - DCI.AddToWorklist(V.getNode()); - return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment(), - St->getAAInfo()); -} - -/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node -/// are normal, non-volatile loads. If so, it is profitable to bitcast an -/// i64 vector to have f64 elements, since the value can then be loaded -/// directly into a VFP register. -static bool hasNormalLoadOperand(SDNode *N) { - unsigned NumElts = N->getValueType(0).getVectorNumElements(); - for (unsigned i = 0; i < NumElts; ++i) { - SDNode *Elt = N->getOperand(i).getNode(); - if (ISD::isNormalLoad(Elt) && !cast(Elt)->isVolatile()) - return true; - } - return false; -} +/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node +/// are normal, non-volatile loads. If so, it is profitable to bitcast an +/// i64 vector to have f64 elements, since the value can then be loaded +/// directly into a VFP register. +static bool hasNormalLoadOperand(SDNode *N) { + unsigned NumElts = N->getValueType(0).getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ISD::isNormalLoad(Elt) && !cast(Elt)->isVolatile()) + return true; + } + return false; +} /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for /// ISD::BUILD_VECTOR. @@ -8918,7 +9044,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(V.getNode()); } - SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32); + SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); } Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); @@ -9006,18 +9132,21 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { DAG.getUNDEF(VT), NewMask.data()); } -/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and -/// NEON load/store intrinsics to merge base address updates. +/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, +/// NEON load/store intrinsics, and generic vector load/stores, to merge +/// base address updates. +/// For generic load/stores, the memory type is assumed to be a vector. +/// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; - bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || - N->getOpcode() == ISD::INTRINSIC_W_CHAIN); - unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); + const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || + N->getOpcode() == ISD::INTRINSIC_W_CHAIN); + const bool isStore = N->getOpcode() == ISD::STORE; + const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); SDValue Addr = N->getOperand(AddrOpIdx); + MemSDNode *MemN = cast(N); + SDLoc dl(N); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), @@ -9033,7 +9162,7 @@ static SDValue CombineBaseUpdate(SDNode *N, continue; // Find the new opcode for the updating load/store. - bool isLoad = true; + bool isLoadOp = true; bool isLaneOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; @@ -9056,19 +9185,19 @@ static SDValue CombineBaseUpdate(SDNode *N, case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; NumVecs = 4; isLaneOp = true; break; case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; - NumVecs = 1; isLoad = false; break; + NumVecs = 1; isLoadOp = false; break; case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; - NumVecs = 2; isLoad = false; break; + NumVecs = 2; isLoadOp = false; break; case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; - NumVecs = 3; isLoad = false; break; + NumVecs = 3; isLoadOp = false; break; case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; - NumVecs = 4; isLoad = false; break; + NumVecs = 4; isLoadOp = false; break; case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; - NumVecs = 2; isLoad = false; isLaneOp = true; break; + NumVecs = 2; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; - NumVecs = 3; isLoad = false; isLaneOp = true; break; + NumVecs = 3; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; - NumVecs = 4; isLoad = false; isLaneOp = true; break; + NumVecs = 4; isLoadOp = false; isLaneOp = true; break; } } else { isLaneOp = true; @@ -9077,15 +9206,24 @@ static SDValue CombineBaseUpdate(SDNode *N, case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; + case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; + NumVecs = 1; isLaneOp = false; break; + case ISD::STORE: NewOpc = ARMISD::VST1_UPD; + NumVecs = 1; isLaneOp = false; isLoadOp = false; break; } } // Find the size of memory referenced by the load/store. EVT VecTy; - if (isLoad) + if (isLoadOp) { VecTy = N->getValueType(0); - else + } else if (isIntrinsic) { VecTy = N->getOperand(AddrOpIdx+1).getValueType(); + } else { + assert(isStore && "Node has to be a load, a store, or an intrinsic!"); + VecTy = N->getOperand(1).getValueType(); + } + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) NumBytes /= VecTy.getVectorNumElements(); @@ -9102,32 +9240,99 @@ static SDValue CombineBaseUpdate(SDNode *N, continue; } + // OK, we found an ADD we can fold into the base update. + // Now, create a _UPD node, taking care of not breaking alignment. + + EVT AlignedVecTy = VecTy; + unsigned Alignment = MemN->getAlignment(); + + // If this is a less-than-standard-aligned load/store, change the type to + // match the standard alignment. + // The alignment is overlooked when selecting _UPD variants; and it's + // easier to introduce bitcasts here than fix that. + // There are 3 ways to get to this base-update combine: + // - intrinsics: they are assumed to be properly aligned (to the standard + // alignment of the memory type), so we don't need to do anything. + // - ARMISD::VLDx nodes: they are only generated from the aforementioned + // intrinsics, so, likewise, there's nothing to do. + // - generic load/store instructions: the alignment is specified as an + // explicit operand, rather than implicitly as the standard alignment + // of the memory type (like the intrisics). We need to change the + // memory type to match the explicit alignment. That way, we don't + // generate non-standard-aligned ARMISD::VLDx nodes. + if (isa(N)) { + if (Alignment == 0) + Alignment = 1; + if (Alignment < VecTy.getScalarSizeInBits() / 8) { + MVT EltTy = MVT::getIntegerVT(Alignment * 8); + assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); + assert(!isLaneOp && "Unexpected generic load/store lane."); + unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); + AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); + } + // Don't set an explicit alignment on regular load/stores that we want + // to transform to VLD/VST 1_UPD nodes. + // This matches the behavior of regular load/stores, which only get an + // explicit alignment if the MMO alignment is larger than the standard + // alignment of the memory type. + // Intrinsics, however, always get an explicit alignment, set to the + // alignment of the MMO. + Alignment = 1; + } + // Create the new updating load/store node. + // First, create an SDVTList for the new updating node's results. EVT Tys[6]; - unsigned NumResultVecs = (isLoad ? NumVecs : 0); + unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); unsigned n; for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; + Tys[n] = AlignedVecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); + + // Then, gather the new node's operands. SmallVector Ops; Ops.push_back(N->getOperand(0)); // incoming chain Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); - for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { - Ops.push_back(N->getOperand(i)); + + if (StoreSDNode *StN = dyn_cast(N)) { + // Try to match the intrinsic's signature + Ops.push_back(StN->getValue()); + } else { + // Loads (and of course intrinsics) match the intrinsics' signature, + // so just add all but the alignment operand. + for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) + Ops.push_back(N->getOperand(i)); } - MemIntrinsicSDNode *MemInt = cast(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, - Ops, MemInt->getMemoryVT(), - MemInt->getMemOperand()); + + // For all node types, the alignment operand is always the last one. + Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); + + // If this is a non-standard-aligned STORE, the penultimate operand is the + // stored value. Bitcast it to the aligned type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { + SDValue &StVal = Ops[Ops.size()-2]; + StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); + } + + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, + Ops, AlignedVecTy, + MemN->getMemOperand()); // Update the uses. - std::vector NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) { + SmallVector NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) NewResults.push_back(SDValue(UpdN.getNode(), i)); + + // If this is an non-standard-aligned LOAD, the first result is the loaded + // value. Bitcast it to the expected result type. + if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { + SDValue &LdVal = NewResults[0]; + LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); @@ -9137,6 +9342,14 @@ static SDValue CombineBaseUpdate(SDNode *N, return SDValue(); } +static SDValue PerformVLDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + return CombineBaseUpdate(N, DCI); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -9250,6 +9463,166 @@ static SDValue PerformVDUPLANECombine(SDNode *N, return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } +static SDValue PerformLOADCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + // If this is a legal vector load, try to combine it into a VLD1_UPD. + if (ISD::isNormalLoad(N) && VT.isVector() && + DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + + return SDValue(); +} + +/// PerformSTORECombine - Target-specific dag combine xforms for +/// ISD::STORE. +static SDValue PerformSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + StoreSDNode *St = cast(N); + if (St->isVolatile()) + return SDValue(); + + // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // pack all of the elements in one place. Next, store to memory in fewer + // chunks. + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + if (St->isTruncatingStore() && VT.isVector()) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDLoc DL(St); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) + ShuffleVec[i] = DAG.getDataLayout().isBigEndian() + ? (i + 1) * SizeRatio - 1 + : i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; + } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. + unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + StoreType, ShuffWide, + DAG.getIntPtrConstant(I, DL)); + SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, + Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (!ISD::isNormalStore(St)) + return SDValue(); + + // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and + // ARM stores of arguments in the same cache line. + if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && + StVal.getNode()->hasOneUse()) { + SelectionDAG &DAG = DCI.DAG; + bool isBigEndian = DAG.getDataLayout().isBigEndian(); + SDLoc DL(St); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = DAG.getStore(St->getChain(), DL, + StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), + BasePtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, + DAG.getConstant(4, DL, MVT::i32)); + return DAG.getStore(NewST1.getValue(0), DL, + StVal.getNode()->getOperand(isBigEndian ? 0 : 1), + OffsetPtr, St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(4U, St->getAlignment() / 2)); + } + + if (StVal.getValueType() == MVT::i64 && + StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + + // Bitcast an i64 store extracted from a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(StVal); + SDValue IntVec = StVal.getOperand(0); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, + IntVec.getValueType().getVectorNumElements()); + SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); + SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + Vec, StVal.getOperand(1)); + dl = SDLoc(N); + SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); + // Make the DAGCombiner fold the bitcasts. + DCI.AddToWorklist(Vec.getNode()); + DCI.AddToWorklist(ExtElt.getNode()); + DCI.AddToWorklist(V.getNode()); + return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment(), + St->getAAInfo()); + } + + // If this is a legal vector store, try to combine it into a VST1_UPD. + if (ISD::isNormalStore(N) && VT.isVector() && + DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return CombineBaseUpdate(N, DCI); + + return SDValue(); +} + // isConstVecPow2 - Return true if each vector element is a power of 2, all // elements are the same constant, C, and Log2(C) ranges from 1 to 32. static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) @@ -9306,23 +9679,27 @@ static SDValue PerformVCVTCombine(SDNode *N, MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 || + NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would - // be lossy. + // be lossy. We also can't handle more then 4 lanes, since these intructions + // only support v2i32/v4i32 types. return SDValue(); } + SDLoc dl(N); unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - unsigned NumLanes = Op.getValueType().getVectorNumElements(); - SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), + SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, - DAG.getConstant(IntrinsicOpcode, MVT::i32), N0, - DAG.getConstant(Log2_64(C), MVT::i32)); + DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), + N0, + DAG.getConstant(Log2_64(C), dl, MVT::i32)); if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) - FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv); + FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); return FixConv; } @@ -9364,19 +9741,20 @@ static SDValue PerformVDIVCombine(SDNode *N, return SDValue(); } + SDLoc dl(N); SDValue ConvInput = Op.getOperand(0); unsigned NumLanes = Op.getValueType().getVectorNumElements(); if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, + dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput); unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : Intrinsic::arm_neon_vcvtfxu2fp; - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), - DAG.getConstant(IntrinsicOpcode, MVT::i32), - ConvInput, DAG.getConstant(Log2_64(C), MVT::i32)); + DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), + ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate @@ -9404,7 +9782,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); @@ -9419,12 +9797,16 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + return true; + } + return false; } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. @@ -9435,6 +9817,15 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Don't do anything for most intrinsics. break; + case Intrinsic::arm_neon_vabds: + if (!N->getValueType(0).isInteger()) + return SDValue(); + return DAG.getNode(ISD::SABSDIFF, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::arm_neon_vabdu: + return DAG.getNode(ISD::UABSDIFF, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + // Vector shifts: check for immediate versions and lower them. // Note: This is done during DAG combining instead of DAG legalizing because // the build_vectors for 64-bit vector element shift counts are generally @@ -9537,8 +9928,9 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { VShiftOpc = ARMISD::VQRSHRNsu; break; } - return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), - N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); + SDLoc dl(N); + return DAG.getNode(VShiftOpc, dl, N->getValueType(0), + N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); } case Intrinsic::arm_neon_vshiftins: { @@ -9554,9 +9946,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } - return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), + SDLoc dl(N); + return DAG.getNode(VShiftOpc, dl, N->getValueType(0), N->getOperand(1), N->getOperand(2), - DAG.getConstant(Cnt, MVT::i32)); + DAG.getConstant(Cnt, dl, MVT::i32)); } case Intrinsic::arm_neon_vqrshifts: @@ -9601,9 +9994,11 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, default: llvm_unreachable("unexpected shift opcode"); case ISD::SHL: - if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) - return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { + SDLoc dl(N); + return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + } break; case ISD::SRA: @@ -9611,8 +10006,9 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? ARMISD::VSHRs : ARMISD::VSHRu); - return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); + SDLoc dl(N); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); } } return SDValue(); @@ -9838,10 +10234,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); + case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: - return CombineBaseUpdate(N, DCI); + return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); case ISD::INTRINSIC_VOID: @@ -9861,7 +10258,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: - return CombineBaseUpdate(N, DCI); + return PerformVLDCombine(N, DCI); default: break; } break; @@ -9900,7 +10297,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. // A big-endian target may also explicitly support unaligned accesses - if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) { + if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) *Fast = true; return true; @@ -9924,10 +10321,8 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, const Function *F = MF.getFunction(); // See if we can use NEON instructions for this... - if ((!IsMemset || ZeroMemset) && - Subtarget->hasNEON() && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) { + if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && + !F->hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || @@ -9972,6 +10367,28 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + EVT VT = ExtVal.getValueType(); + + if (!isTypeLegal(VT)) + return false; + + // Don't create a loadext if we can fold the extension into a wide/long + // instruction. + // If there's more than one user instruction, the loadext is desirable no + // matter what. There can be two uses by the same instruction. + if (ExtVal->use_empty() || + !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) + return true; + + SDNode *U = *ExtVal->use_begin(); + if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || + U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) + return false; + + return true; +} + bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; @@ -10119,9 +10536,10 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. -bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { - EVT VT = getValueType(Ty, true); +bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + EVT VT = getValueType(DL, Ty, true); if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) return false; @@ -10185,9 +10603,9 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { // Thumb2 and ARM modes can use cmn for negative immediates. if (!Subtarget->isThumb()) - return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1; + return ARM_AM::getSOImmVal(std::abs(Imm)) != -1; if (Subtarget->isThumb2()) - return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1; + return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1; // Thumb1 doesn't have cmn, and only 8-bit immediates. return Imm >= 0 && Imm <= 255; } @@ -10198,7 +10616,7 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { /// immediate into a register. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { // Same encoding for add/sub, just flip the sign. - int64_t AbsImm = llvm::abs64(Imm); + int64_t AbsImm = std::abs(Imm); if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal(AbsImm) != -1; if (Subtarget->isThumb2()) @@ -10222,7 +10640,7 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, if (RHSC < 0 && RHSC > -256) { assert(Ptr->getOpcode() == ISD::ADD); isInc = false; - Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } } @@ -10236,7 +10654,7 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, if (RHSC < 0 && RHSC > -0x1000) { assert(Ptr->getOpcode() == ISD::ADD); isInc = false; - Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); Base = Ptr->getOperand(0); return true; } @@ -10279,11 +10697,11 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, if (RHSC < 0 && RHSC > -0x100) { // 8 bits. assert(Ptr->getOpcode() == ISD::ADD); isInc = false; - Offset = DAG.getConstant(-RHSC, RHS->getValueType(0)); + Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. isInc = Ptr->getOpcode() == ISD::ADD; - Offset = DAG.getConstant(RHSC, RHS->getValueType(0)); + Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } } @@ -10465,7 +10883,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. ARMTargetLowering::ConstraintType -ARMTargetLowering::getConstraintType(const std::string &Constraint) const { +ARMTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; @@ -10524,9 +10942,8 @@ ARMTargetLowering::getSingleConstraintMatchWeight( } typedef std::pair RCPair; -RCPair -ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const { +RCPair ARMTargetLowering::getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC ARM Constraint Letters switch (Constraint[0]) { @@ -10539,6 +10956,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return RCPair(0U, &ARM::hGPRRegClass); break; case 'r': + if (Subtarget->isThumb1Only()) + return RCPair(0U, &ARM::tGPRRegClass); return RCPair(0U, &ARM::GPRRegClass); case 'w': if (VT == MVT::Other) @@ -10569,7 +10988,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); - return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops @@ -10728,7 +11147,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, } return; } - Result = DAG.getTargetConstant(CVal, Op.getValueType()); + Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); break; } @@ -10772,9 +11191,9 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), - getPointerTy()); + getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL); + Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); @@ -10796,7 +11215,7 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const SDValue Size = Op.getOperand(1); SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, - DAG.getConstant(2, MVT::i32)); + DAG.getConstant(2, DL, MVT::i32)); SDValue Flag; Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); @@ -10849,11 +11268,7 @@ bool ARM::isBitFieldInvertedMask(unsigned v) { // there can be 1's on either or both "outsides", all the "inside" // bits must be 0's - unsigned TO = CountTrailingOnes_32(v); - unsigned LO = CountLeadingOnes_32(v); - v = (v >> TO) << TO; - v = (v << LO) >> LO; - return v == 0; + return isShiftedMask_32(~v); } /// isFPImmLegal - Returns true if the target can instruction select the @@ -10885,7 +11300,8 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_neon_vld4lane: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); + uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; @@ -10905,12 +11321,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_neon_vst4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeAllocSize(ArgTy) / 8; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); @@ -10924,12 +11341,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; @@ -10937,12 +11355,13 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } case Intrinsic::arm_stlex: case Intrinsic::arm_strex: { + auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; @@ -11095,14 +11514,46 @@ bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles -bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { +TargetLoweringBase::AtomicRMWExpansionKind +ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= (Subtarget->isMClass() ? 32U : 64U); + return (Size <= (Subtarget->isMClass() ? 32U : 64U)) + ? AtomicRMWExpansionKind::LLSC + : AtomicRMWExpansionKind::None; } // This has so far only been implemented for MachO. bool ARMTargetLowering::useLoadStackGuardNode() const { - return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO; + return Subtarget->isTargetMachO(); +} + +bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, + unsigned &Cost) const { + // If we do not have NEON, vector types are not natively supported. + if (!Subtarget->hasNEON()) + return false; + + // Floating point values and vector values map to the same register file. + // Therefore, althought we could do a store extract of a vector type, this is + // better to leave at float as we have more freedom in the addressing mode for + // those. + if (VectorTy->isFPOrFPVectorTy()) + return false; + + // If the index is unknown at compile time, this is very expensive to lower + // and it is not possible to combine the store with the extract. + if (!isa(Idx)) + return false; + + assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); + unsigned BitWidth = cast(VectorTy)->getBitWidth(); + // We can do a store + vector extract on any vector that fits perfectly in a D + // or Q register. + if (BitWidth == 64 || BitWidth == 128) { + Cost = 0; + return true; + } + return false; } Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, @@ -11161,17 +11612,178 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, if (!Subtarget->isLittle()) std::swap (Lo, Hi); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - return Builder.CreateCall3(Strex, Lo, Hi, Addr); + return Builder.CreateCall(Strex, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; Type *Tys[] = { Addr->getType() }; Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); - return Builder.CreateCall2( - Strex, Builder.CreateZExtOrBitCast( - Val, Strex->getFunctionType()->getParamType(0)), - Addr); + return Builder.CreateCall( + Strex, {Builder.CreateZExtOrBitCast( + Val, Strex->getFunctionType()->getParamType(0)), + Addr}); +} + +/// \brief Lower an interleaved load into a vldN intrinsic. +/// +/// E.g. Lower an interleaved load (Factor = 2): +/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 +/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements +/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements +/// +/// Into: +/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) +/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 +/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 +bool ARMTargetLowering::lowerInterleavedLoad( + LoadInst *LI, ArrayRef Shuffles, + ArrayRef Indices, unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + assert(!Shuffles.empty() && "Empty shufflevector input"); + assert(Shuffles.size() == Indices.size() && + "Unmatched number of shufflevectors and indices"); + + VectorType *VecTy = Shuffles[0]->getType(); + Type *EltTy = VecTy->getVectorElementType(); + + const DataLayout &DL = LI->getModule()->getDataLayout(); + unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); + bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + + // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't + // support i64/f64 element). + if ((VecSize != 64 && VecSize != 128) || EltIs64Bits) + return false; + + // A pointer vector can not be the return type of the ldN intrinsics. Need to + // load integer vectors first and then convert to pointer vectors. + if (EltTy->isPointerTy()) + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + + static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, + Intrinsic::arm_neon_vld3, + Intrinsic::arm_neon_vld4}; + + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy); + + IRBuilder<> Builder(LI); + SmallVector Ops; + + Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); + Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); + Ops.push_back(Builder.getInt32(LI->getAlignment())); + + CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); + + // Replace uses of each shufflevector with the corresponding vector loaded + // by ldN. + for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SV = Shuffles[i]; + unsigned Index = Indices[i]; + + Value *SubVec = Builder.CreateExtractValue(VldN, Index); + + // Convert the integer vector to pointer vector if the element is pointer. + if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); + + SV->replaceAllUsesWith(SubVec); + } + + return true; +} + +/// \brief Get a mask consisting of sequential integers starting from \p Start. +/// +/// I.e. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, + unsigned NumElts) { + SmallVector Mask; + for (unsigned i = 0; i < NumElts; i++) + Mask.push_back(Builder.getInt32(Start + i)); + + return ConstantVector::get(Mask); +} + +/// \brief Lower an interleaved store into a vstN intrinsic. +/// +/// E.g. Lower an interleaved store (Factor = 3): +/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 +/// +/// Into: +/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> +/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> +/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> +/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) +/// +/// Note that the new shufflevectors will be removed and we'll only generate one +/// vst3 instruction in CodeGen. +bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + + VectorType *VecTy = SVI->getType(); + assert(VecTy->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); + + unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; + Type *EltTy = VecTy->getVectorElementType(); + VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); + + const DataLayout &DL = SI->getModule()->getDataLayout(); + unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + + // Skip illegal sub vector types and vector types of i64/f64 element (vstN + // doesn't support i64/f64 element). + if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits) + return false; + + Value *Op0 = SVI->getOperand(0); + Value *Op1 = SVI->getOperand(1); + IRBuilder<> Builder(SI); + + // StN intrinsics don't support pointer vectors as arguments. Convert pointer + // vectors to integer vectors. + if (EltTy->isPointerTy()) { + Type *IntTy = DL.getIntPtrType(EltTy); + + // Convert to the corresponding integer vector. + Type *IntVecTy = + VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); + Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); + Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); + + SubVecTy = VectorType::get(IntTy, NumSubElts); + } + + static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], SubVecTy); + + SmallVector Ops; + + Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); + Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + + // Split the shufflevector operands into sub vectors for the new vstN call. + for (unsigned i = 0; i < Factor; i++) + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); + + Ops.push_back(Builder.getInt32(SI->getAlignment())); + Builder.CreateCall(VstNFunc, Ops); + return true; } enum HABaseType { @@ -11233,7 +11845,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, return (Members > 0 && Members <= 4); } -/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate. +/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of +/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when +/// passing according to AAPCS rules. bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { if (getEffectiveCallingConv(CallConv, isVarArg) != @@ -11242,7 +11856,9 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( HABaseType Base = HA_UNKNOWN; uint64_t Members = 0; - bool result = isHomogeneousAggregate(Ty, Base, Members); - DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump()); - return result; + bool IsHA = isHomogeneousAggregate(Ty, Base, Members); + DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); + + bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); + return IsHA || IsIntArray; }