X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMISelLowering.cpp;h=22efa48709e97343d555a1c790d10058b9639f2b;hb=e246717c3a36a913fd4200776ed621649bb2b624;hp=f2d81755f4635908093f963e61a2f365f3b72f2f;hpb=3a1588a2e38d57de3c4277f071f2316fb3dbc37a;p=oota-llvm.git

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index f2d81755f46..22efa48709e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -12,8 +12,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#define DEBUG_TYPE "arm-isel"
 #include "ARM.h"
 #include "ARMAddressingModes.h"
+#include "ARMCallingConv.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMISelLowering.h"
 #include "ARMMachineFunctionInfo.h"
@@ -27,6 +29,7 @@
 #include "llvm/Function.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/Type.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -40,6 +43,7 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -47,27 +51,23 @@
 #include <sstream>
 using namespace llvm;
 
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+// This option should go away when tail calls fully work.
+static cl::opt<bool>
+EnableARMTailCalls("arm-tail-calls", cl::Hidden,
+  cl::desc("Generate tail calls (TEMPORARY OPTION)."),
+  cl::init(false));
+
 static cl::opt<bool>
 EnableARMLongCalls("arm-long-calls", cl::Hidden,
-  cl::desc("Generate calls via indirect call instructions."),
+  cl::desc("Generate calls via indirect call instructions"),
   cl::init(false));
 
-static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                   CCValAssign::LocInfo &LocInfo,
-                                   ISD::ArgFlagsTy &ArgFlags,
-                                   CCState &State);
-static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                    CCValAssign::LocInfo &LocInfo,
-                                    ISD::ArgFlagsTy &ArgFlags,
-                                    CCState &State);
-static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                      CCValAssign::LocInfo &LocInfo,
-                                      ISD::ArgFlagsTy &ArgFlags,
-                                      CCState &State);
-static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
-                                       CCValAssign::LocInfo &LocInfo,
-                                       ISD::ArgFlagsTy &ArgFlags,
-                                       CCState &State);
+static cl::opt<bool>
+ARMInterworking("arm-interworking", cl::Hidden,
+  cl::desc("Enable / disable ARM interworking (for debugging only)"),
+  cl::init(true));
 
 void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
                                        EVT PromotedBitwiseVT) {
@@ -84,8 +84,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
   EVT ElemTy = VT.getVectorElementType();
   if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
     setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
-  if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
   if (ElemTy != MVT::i32) {
     setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
     setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
@@ -94,7 +93,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
   }
   setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, 
VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); @@ -102,7 +101,14 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand); + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction(VT.getSimpleVT(), + (MVT::SimpleValueType)InnerVT, Expand); } + setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); // Promote all bit-wise operations. if (VT.isInteger() && VT != PromotedBitwiseVT) { @@ -146,6 +152,8 @@ static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget(); + RegInfo = TM.getRegisterInfo(); + Itins = TM.getInstrItineraryData(); if (Subtarget->isTargetDarwin()) { // Uses VFP for Thumb libfuncs if available. @@ -229,13 +237,157 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLibcallName(RTLIB::SRL_I128, 0); setLibcallName(RTLIB::SRA_I128, 0); - // Libcalls should use the AAPCS base standard ABI, even if hard float - // is in effect, as per the ARM RTABI specification, section 4.1.2. if (Subtarget->isAAPCS_ABI()) { - for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { - setLibcallCallingConv(static_cast(i), - CallingConv::ARM_AAPCS); - } + // Double-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 2 + setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd"); + setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv"); + setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul"); + setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub"); + setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS); + + // Double-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 3 + setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq"); + setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); + setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq"); + setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ); + setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt"); + setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); + setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple"); + setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); + setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge"); + setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); + setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt"); + setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); + setLibcallName(RTLIB::UO_F64, "__aeabi_dcmpun"); + setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); + setLibcallName(RTLIB::O_F64, "__aeabi_dcmpun"); + setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); + setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS); + 
setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS); + + // Single-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 4 + setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd"); + setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv"); + setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul"); + setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub"); + setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS); + + // Single-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 5 + setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq"); + setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); + setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq"); + setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ); + setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt"); + setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); + setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple"); + setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); + setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge"); + setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); + setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt"); + setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); + setLibcallName(RTLIB::UO_F32, "__aeabi_fcmpun"); + setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); + setLibcallName(RTLIB::O_F32, "__aeabi_fcmpun"); + setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); + setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS); + + // Floating-point to integer conversions. + // RTABI chapter 4.1.2, Table 6 + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz"); + setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz"); + setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz"); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz"); + setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz"); + setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz"); + setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS); + + // Conversions between floating types. 
+ // RTABI chapter 4.1.2, Table 7 + setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f"); + setLibcallName(RTLIB::FPEXT_F32_F64, "__aeabi_f2d"); + setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS); + + // Integer to floating-point conversions. + // RTABI chapter 4.1.2, Table 8 + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d"); + setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d"); + setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f"); + setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f"); + setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f"); + setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS); + + // Long long helper functions + // RTABI chapter 4.2, Table 9 + setLibcallName(RTLIB::MUL_I64, "__aeabi_lmul"); + setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod"); + setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod"); + setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl"); + setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr"); + setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr"); + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS); + + // Integer division functions + // RTABI chapter 4.3.1 + setLibcallName(RTLIB::SDIV_I8, "__aeabi_idiv"); + setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv"); + setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv"); + setLibcallName(RTLIB::UDIV_I8, "__aeabi_uidiv"); + setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv"); + setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv"); + setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS); + setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS); } if (Subtarget->isThumb1Only()) @@ -244,7 +396,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) addRegisterClass(MVT::i32, ARM::GPRRegisterClass); if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, ARM::SPRRegisterClass); - addRegisterClass(MVT::f64, ARM::DPRRegisterClass); + if (!Subtarget->isFPOnlySP()) + addRegisterClass(MVT::f64, ARM::DPRRegisterClass); setTruncStoreAction(MVT::f64, MVT::f32, Expand); } @@ -290,9 +443,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FNEARBYINT, 
MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); - setOperationAction(ISD::MUL, MVT::v2i64, Expand); + // Custom handling for some quad-vector types to detect VMULL. + setOperationAction(ISD::MUL, MVT::v8i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::VSETCC, MVT::v1i64, Expand); setOperationAction(ISD::VSETCC, MVT::v2i64, Expand); @@ -304,6 +462,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); } computeRegisterProperties(); @@ -360,8 +520,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::BSWAP, MVT::i32, Expand); // These are expanded into libcalls. - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); + if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) { + // v7M has a hardware divider + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); @@ -373,6 +536,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + setOperationAction(ISD::TRAP, MVT::Other, Legal); + // Use the default implementation. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); @@ -385,28 +550,83 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // doesn't yet know how to not do that for SjLj. setExceptionSelectorRegister(ARM::R0); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); - - if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) { + // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use + // the default expansion. + if (Subtarget->hasDataBarrier() || + (Subtarget->hasV6Ops() && !Subtarget->isThumb())) { + // membarrier needs custom lowering; the rest are legal and handled + // normally. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + } else { + // Set them all for expansion, which will force libcalls. 
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + // Since the libcalls include locking, fold in the fences + setShouldFoldAtomicFences(true); + } + // 64-bit versions are always libcalls (for now) + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand); + + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. + if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) + if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. - setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom); + setOperationAction(ISD::BITCAST, MVT::i64, Custom); + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); + } // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->isTargetDarwin()) { + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom); + } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::SETCC, MVT::f64, Expand); - setOperationAction(ISD::SELECT, MVT::i32, Expand); - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::f64, Expand); + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); @@ -451,32 +671,58 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::MUL); + + if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) + setTargetDAGCombine(ISD::OR); + if (Subtarget->hasNEON()) + setTargetDAGCombine(ISD::AND); setStackPointerRegisterToSaveRestore(ARM::SP); - setSchedulingPreference(SchedulingForRegPressure); - - // FIXME: If-converter should use instruction latency to determine - // profitability rather than relying on fixed limits. - if (Subtarget->getCPUString() == "generic") { - // Generic (and overly aggressive) if-conversion limits. - setIfCvtBlockSizeLimit(10); - setIfCvtDupBlockSizeLimit(2); - } else if (Subtarget->hasV7Ops()) { - setIfCvtBlockSizeLimit(3); - setIfCvtDupBlockSizeLimit(1); - } else if (Subtarget->hasV6Ops()) { - setIfCvtBlockSizeLimit(2); - setIfCvtDupBlockSizeLimit(1); - } else { - setIfCvtBlockSizeLimit(3); - setIfCvtDupBlockSizeLimit(2); - } + + if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) + setSchedulingPreference(Sched::RegPressure); + else + setSchedulingPreference(Sched::Hybrid); maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type - // Do not enable CodePlacementOpt for now: it currently runs after the - // ARMConstantIslandPass and messes up branch relaxation and placement - // of constant islands. - // benefitFromCodePlacementOpt = true; + + // On ARM arguments smaller than 4 bytes are extended, so all arguments + // are at least 4 bytes aligned. + setMinStackArgumentAlignment(4); + + benefitFromCodePlacementOpt = true; +} + +std::pair +ARMTargetLowering::findRepresentativeClass(EVT VT) const{ + const TargetRegisterClass *RRC = 0; + uint8_t Cost = 1; + switch (VT.getSimpleVT().SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(VT); + // Use DPR as representative register class for all floating point + // and vector types. Since there are 32 SPR registers and 32 DPR registers so + // the cost is 1 for both f32 and f64. 
+ case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: + case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: + RRC = ARM::DPRRegisterClass; + break; + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + RRC = ARM::DPRRegisterClass; + Cost = 2; + break; + case MVT::v4i64: + RRC = ARM::DPRRegisterClass; + Cost = 4; + break; + case MVT::v8i64: + RRC = ARM::DPRRegisterClass; + Cost = 8; + break; + } + return std::make_pair(RRC, Cost); } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -497,6 +743,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; + case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; case ARMISD::CNEG: return "ARMISD::CNEG"; @@ -512,18 +759,23 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; - case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; - case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; + case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; + case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_DISPATCHSETUP:return "ARMISD::EH_SJLJ_DISPATCHSETUP"; + + case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; case ARMISD::MEMBARRIER: return "ARMISD::MEMBARRIER"; - case ARMISD::SYNCBARRIER: return "ARMISD::SYNCBARRIER"; + case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; + + case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCGE: return "ARMISD::VCGE"; @@ -553,6 +805,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; + case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; + case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; case ARMISD::VDUP: return "ARMISD::VDUP"; case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; case ARMISD::VEXT: return "ARMISD::VEXT"; @@ -562,14 +816,100 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::VMULLs: return "ARMISD::VMULLs"; + case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; + case ARMISD::BFI: return "ARMISD::BFI"; + case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; + case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; + case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; + case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; + case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; + } +} + +/// getRegClassFor - Return the register class that should be used for the +/// specified value type. +TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { + // Map v4i64 to QQ registers but do not make the type legal. 
Similarly map + // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to + // load / store 4 to 8 consecutive D registers. + if (Subtarget->hasNEON()) { + if (VT == MVT::v4i64) + return ARM::QQPRRegisterClass; + else if (VT == MVT::v8i64) + return ARM::QQQQPRRegisterClass; } + return TargetLowering::getRegClassFor(VT); +} + +// Create a fast isel object. +FastISel * +ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { + return ARM::createFastISel(funcInfo); } /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { - return getTargetMachine().getSubtarget().isThumb() ? 0 : 1; + return getTargetMachine().getSubtarget().isThumb() ? 1 : 2; +} + +/// getMaximalGlobalOffset - Returns the maximal possible offset which can +/// be used for loads / stores from the global. +unsigned ARMTargetLowering::getMaximalGlobalOffset() const { + return (Subtarget->isThumb1Only() ? 127 : 4095); +} + +Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { + unsigned NumVals = N->getNumValues(); + if (!NumVals) + return Sched::RegPressure; + + for (unsigned i = 0; i != NumVals; ++i) { + EVT VT = N->getValueType(i); + if (VT == MVT::Flag || VT == MVT::Other) + continue; + if (VT.isFloatingPoint() || VT.isVector()) + return Sched::Latency; + } + + if (!N->isMachineOpcode()) + return Sched::RegPressure; + + // Load are scheduled for latency even if there instruction itinerary + // is not available. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrDesc &TID = TII->get(N->getMachineOpcode()); + + if (TID.getNumDefs() == 0) + return Sched::RegPressure; + if (!Itins->isEmpty() && + Itins->getOperandCycle(TID.getSchedClass(), 0) > 2) + return Sched::Latency; + + return Sched::RegPressure; +} + +unsigned +ARMTargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameInfo *TFI = MF.getTarget().getFrameInfo(); + + switch (RC->getID()) { + default: + return 0; + case ARM::tGPRRegClassID: + return TFI->hasFP(MF) ? 4 : 5; + case ARM::GPRRegClassID: { + unsigned FP = TFI->hasFP(MF) ? 1 : 0; + return 10 - FP - (Subtarget->isR9Reserved() ? 1 : 0); + } + case ARM::SPRRegClassID: // Currently not used as 'rep' register class. + case ARM::DPRRegClassID: + return 32 - 10; + } } //===----------------------------------------------------------------------===// @@ -628,131 +968,6 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, #include "ARMGenCallingConv.inc" -// APCS f64 is in register pairs, possibly split to stack -static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - CCState &State, bool CanFail) { - static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; - - // Try to get the first register. - if (unsigned Reg = State.AllocateReg(RegList, 4)) - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - else { - // For the 2nd half of a v2f64, do not fail. - if (CanFail) - return false; - - // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 4), - LocVT, LocInfo)); - return true; - } - - // Try to get the second register. 
- if (unsigned Reg = State.AllocateReg(RegList, 4)) - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - else - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(4, 4), - LocVT, LocInfo)); - return true; -} - -static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) - return false; - if (LocVT == MVT::v2f64 && - !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) - return false; - return true; // we handled it -} - -// AAPCS f64 is in aligned register pairs -static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - CCState &State, bool CanFail) { - static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; - static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; - - unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); - if (Reg == 0) { - // For the 2nd half of a v2f64, do not just fail. - if (CanFail) - return false; - - // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 8), - LocVT, LocInfo)); - return true; - } - - unsigned i; - for (i = 0; i < 2; ++i) - if (HiRegList[i] == Reg) - break; - - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - LocVT, LocInfo)); - return true; -} - -static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true)) - return false; - if (LocVT == MVT::v2f64 && - !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false)) - return false; - return true; // we handled it -} - -static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, CCState &State) { - static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; - static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; - - unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); - if (Reg == 0) - return false; // we didn't handle it - - unsigned i; - for (i = 0; i < 2; ++i) - if (HiRegList[i] == Reg) - break; - - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], - LocVT, LocInfo)); - return true; -} - -static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) - return false; - if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State)) - return false; - return true; // we handled it -} - -static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { - return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, - State); -} - /// CCAssignFnForNode - Selects the correct CCAssignFn for a the /// given CallingConvention value. 
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, @@ -761,23 +976,29 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, switch (CC) { default: llvm_unreachable("Unsupported calling convention"); - case CallingConv::C: case CallingConv::Fast: + if (Subtarget->hasVFP2() && !isVarArg) { + if (!Subtarget->isAAPCS_ABI()) + return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); + // For AAPCS ABI targets, just use VFP variant of the calling convention. + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + } + // Fallthrough + case CallingConv::C: { // Use target triple & subtarget features to do actual dispatch. - if (Subtarget->isAAPCS_ABI()) { - if (Subtarget->hasVFP2() && - FloatABIType == FloatABI::Hard && !isVarArg) - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); - else - return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); - } else - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + if (!Subtarget->isAAPCS_ABI()) + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); + else if (Subtarget->hasVFP2() && + FloatABIType == FloatABI::Hard && !isVarArg) + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + } case CallingConv::ARM_AAPCS_VFP: - return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); case CallingConv::ARM_AAPCS: - return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); case CallingConv::ARM_APCS: - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); } } @@ -788,7 +1009,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { + SmallVectorImpl &InVals) const { // Assign locations to each value returned by this call. SmallVector RVLocs; @@ -844,7 +1065,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val); + Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); break; } @@ -867,7 +1088,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile=*/false, /*AlwaysInline=*/false, - NULL, 0, NULL, 0); + MachinePointerInfo(0), MachinePointerInfo(0)); } /// LowerMemOpCallTo - Store the argument to the stack. 
@@ -876,15 +1097,15 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, - ISD::ArgFlagsTy Flags) { + ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - if (Flags.isByVal()) { + if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - } + return DAG.getStore(Chain, dl, Arg, PtrOff, - PseudoSourceValue::getStack(), LocMemOffset, + MachinePointerInfo::getStack(LocMemOffset), false, false, 0); } @@ -894,7 +1115,7 @@ void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVector &MemOpChains, - ISD::ArgFlagsTy Flags) { + ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); @@ -921,11 +1142,28 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) { - // ARM target does not yet support tail call optimization. - isTailCall = false; + SmallVectorImpl &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsSibCall = false; + // Temporarily disable tail calls so things don't break. + if (!EnableARMTailCalls) + isTailCall = false; + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + // We don't support GuaranteedTailCallOpt for ARM, only automatically + // detected sibcalls. + if (isTailCall) { + ++NumTailCalls; + IsSibCall = true; + } + } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -938,9 +1176,14 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); + // For tail calls, memory operands are available in our caller's stack. + if (IsSibCall) + NumBytes = 0; + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -953,7 +1196,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[realArgIdx].Val; + SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. 
@@ -970,7 +1213,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } @@ -1001,7 +1244,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { + } else if (!IsSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1016,10 +1259,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. + if (!isTailCall) + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // For tail calls lower the arguments to the 'real' stack slot. + if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + + // Do not flag preceeding copytoreg stuff together with the following stuff. 
+ InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag =SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -1028,7 +1293,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, bool isDirect = false; bool isARMFunc = false; bool isLocalARMFunc = false; - MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); if (EnableARMLongCalls) { @@ -1049,7 +1313,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { const char *Sym = S->getSymbol(); @@ -1063,7 +1327,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { @@ -1074,7 +1338,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || isStub; // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && !isExt; + isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); // tBX takes a register source operand. if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); @@ -1085,13 +1349,19 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, getPointerTy(), Callee, PICLabel); - } else - Callee = DAG.getTargetGlobalAddress(GV, getPointerTy()); + } else { + // On ELF targets for PIC code, direct calls should go through the PLT + unsigned OpFlags = 0; + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; bool isStub = Subtarget->isTargetDarwin() && @@ -1107,13 +1377,19 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, getPointerTy(), Callee, PICLabel); - } else - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + } else { + unsigned OpFlags = 0; + // On ELF targets for PIC code, direct calls should go through the PLT + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == 
Reloc::PIC_) + OpFlags = ARMII::MO_PLT; + Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags); + } } // FIXME: handle tail calls differently. @@ -1128,11 +1404,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL) : ARMISD::CALL_NOLINK; } - if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) { - // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK - Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag); - InFlag = Chain.getValue(1); - } std::vector Ops; Ops.push_back(Chain); @@ -1146,9 +1417,13 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + if (isTailCall) + return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), - &Ops[0], Ops.size()); + Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), @@ -1162,11 +1437,187 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, dl, DAG, InVals); } +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const ARMInstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast(Arg.getOperand(1))->getReg(); + if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + return false; + } + } else if (LoadSDNode *Ld = dyn_cast(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, + SelectionDAG& DAG) const { + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. 
+ + // Do not sibcall optimize vararg calls unless the call site is not passing + // any arguments. + if (isVarArg && !Outs.empty()) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo:: + // emitEpilogue is not ready for them. + // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take + // LR. This means if we need to reload LR, it takes an extra instructions, + // which outweighs the value of the tail call; but here we don't know yet + // whether LR is going to be used. Probably the right approach is to + // generate the tail call here and turn it back into CALL/RET in + // emitEpilogue if LR is used. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently do this + // case. + if (Subtarget->isThumb1Only()) + return false; + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector RVLocs1; + CCState CCInfo1(CalleeCC, false, getTargetMachine(), + RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); + + SmallVector RVLocs2; + CCState CCInfo2(CallerCC, false, getTargetMachine(), + RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), + ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CalleeCC, false, isVarArg)); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const ARMInstrInfo *TII = + ((ARMTargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (VA.needsCustom()) { + // f64 and vector types are split into multiple registers or + // register/stack-slot combinations. The types will not match + // the registers; give up on memory f64 refs until we figure + // out what to do about this. 
+ if (!VA.isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + if (RegVT == MVT::v2f64) { + if (!ArgLocs[++i].isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + } + } else if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + } + + return true; +} + SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, - DebugLoc dl, SelectionDAG &DAG) { + const SmallVectorImpl &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; @@ -1196,13 +1647,13 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = Outs[realRVLocIdx].Val; + SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } @@ -1252,6 +1703,61 @@ ARMTargetLowering::LowerReturn(SDValue Chain, return result; } +bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + unsigned NumCopies = 0; + SDNode* Copies[2]; + SDNode *Use = *N->use_begin(); + if (Use->getOpcode() == ISD::CopyToReg) { + Copies[NumCopies++] = Use; + } else if (Use->getOpcode() == ARMISD::VMOVRRD) { + // f64 returned in a pair of GPRs. + for (SDNode::use_iterator UI = Use->use_begin(), UE = Use->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ISD::CopyToReg) + return false; + Copies[UI.getUse().getResNo()] = *UI; + ++NumCopies; + } + } else if (Use->getOpcode() == ISD::BITCAST) { + // f32 returned in a single GPR. + if (!Use->hasNUsesOfValue(1, 0)) + return false; + Use = *Use->use_begin(); + if (Use->getOpcode() != ISD::CopyToReg || !Use->hasNUsesOfValue(1, 0)) + return false; + Copies[NumCopies++] = Use; + } else { + return false; + } + + if (NumCopies != 1 && NumCopies != 2) + return false; + + bool HasRet = false; + for (unsigned i = 0; i < NumCopies; ++i) { + SDNode *Copy = Copies[i]; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() == ISD::CopyToReg) { + SDNode *Use = *UI; + if (Use == Copies[0] || Use == Copies[1]) + continue; + return false; + } + if (UI->getOpcode() != ARMISD::RET_FLAG) + return false; + HasRet = true; + } + } + + return HasRet; +} + // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. 
It has to be wrapped because otherwise @@ -1273,7 +1779,12 @@ static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } -SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { +unsigned ARMTargetLowering::getJumpTableEncoding() const { + return MachineJumpTableInfo::EK_Inline; +} + +SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; @@ -1294,7 +1805,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); if (RelocM == Reloc::Static) return Result; @@ -1305,7 +1816,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { DebugLoc dl = GA->getDebugLoc(); EVT PtrVT = getPointerTy(); unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; @@ -1314,11 +1825,11 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, - ARMCP::CPValue, PCAdj, "tlsgd", true); + ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue Chain = Argument.getValue(1); @@ -1344,7 +1855,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, // "local exec" model. SDValue ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { const GlobalValue *GV = GA->getGlobal(); DebugLoc dl = GA->getDebugLoc(); SDValue Offset; @@ -1361,11 +1872,11 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, - ARMCP::CPValue, PCAdj, "gottpoff", true); + ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); Chain = Offset.getValue(1); @@ -1373,15 +1884,15 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } else { // local exec model - ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff"); + ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); } @@ -1391,7 +1902,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, } SDValue -ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { +ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // TODO: implement the "local dynamic" model assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); @@ -1405,7 +1916,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { } SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); const GlobalValue *GV = cast(Op)->getGlobal(); @@ -1413,39 +1924,38 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, if (RelocM == Reloc::PIC_) { bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); ARMConstantPoolValue *CPV = - new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT"); + new ARMConstantPoolValue(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0, + MachinePointerInfo::getConstantPool(), false, false, 0); SDValue Chain = Result.getValue(1); SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); if (!UseGOTOFF) Result = DAG.getLoad(PtrVT, dl, Chain, Result, - PseudoSourceValue::getGOT(), 0, - false, false, 0); + MachinePointerInfo::getGOT(), false, false, 0); return Result; } else { // If we have T2 ops, we can materialize the address directly via movt/movw // pair. This is always cheaper. 
    if (Subtarget->useMovt()) {
      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
-                         DAG.getTargetGlobalAddress(GV, PtrVT));
+                         DAG.getTargetGlobalAddress(GV, dl, PtrVT));
    } else {
      SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                         PseudoSourceValue::getConstantPool(), 0,
+                         MachinePointerInfo::getConstantPool(),
                          false, false, 0);
    }
  }
 }
 
 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
-                                                    SelectionDAG &DAG) {
+                                                    SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo();
   unsigned ARMPCLabelIndex = 0;
@@ -1466,7 +1976,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
   SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                               PseudoSourceValue::getConstantPool(), 0,
+                               MachinePointerInfo::getConstantPool(),
                                false, false, 0);
   SDValue Chain = Result.getValue(1);
@@ -1476,15 +1986,14 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   }
   if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
-    Result = DAG.getLoad(PtrVT, dl, Chain, Result,
-                         PseudoSourceValue::getGOT(), 0,
+    Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
                          false, false, 0);
   return Result;
 }
 
 SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
-                                                    SelectionDAG &DAG){
+                                                    SelectionDAG &DAG) const {
   assert(Subtarget->isTargetELF() &&
          "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
   MachineFunction &MF = DAG.getMachineFunction();
@@ -1499,15 +2008,38 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
   SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
   SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                               PseudoSourceValue::getConstantPool(), 0,
+                               MachinePointerInfo::getConstantPool(),
                                false, false, 0);
   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
   return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
 }
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG)
+  const {
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Val = DAG.getConstant(0, MVT::i32);
+  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0),
+                     Op.getOperand(1), Val);
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
+}
+
 SDValue
 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
-                                           const ARMSubtarget *Subtarget) {
+                                           const ARMSubtarget *Subtarget) const {
   unsigned IntNo = cast(Op.getOperand(0))->getZExtValue();
   DebugLoc dl = Op.getDebugLoc();
   switch (IntNo) {
@@ -1533,9 +2065,8 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
     SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
-                                 PseudoSourceValue::getConstantPool(), 0,
+                                 MachinePointerInfo::getConstantPool(),
                                  false, false, 0);
-    SDValue Chain = Result.getValue(1);
    if (RelocM == Reloc::PIC_) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
@@ -1543,100 +2074,81 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
    }
    return Result;
  }
-  case Intrinsic::eh_sjlj_setjmp:
-    SDValue Val = Subtarget->isThumb() ?
-      DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) :
-      DAG.getConstant(0, MVT::i32);
-    return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(1),
-                       Val);
  }
 }
 
 static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
-                               const ARMSubtarget *Subtarget) {
+                               const ARMSubtarget *Subtarget) {
   DebugLoc dl = Op.getDebugLoc();
-  SDValue Op5 = Op.getOperand(5);
-  SDValue Res;
-  unsigned isDeviceBarrier = cast(Op5)->getZExtValue();
-  if (isDeviceBarrier) {
-    if (Subtarget->hasV7Ops())
-      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0));
-    else
-      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0),
-                        DAG.getConstant(0, MVT::i32));
-  } else {
-    if (Subtarget->hasV7Ops())
-      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
-    else
-      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
-                        DAG.getConstant(0, MVT::i32));
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
+                       DAG.getConstant(0, MVT::i32));
   }
-  return Res;
+
+  SDValue Op5 = Op.getOperand(5);
+  bool isDeviceBarrier = cast(Op5)->getZExtValue() != 0;
+  unsigned isLL = cast(Op.getOperand(1))->getZExtValue();
+  unsigned isLS = cast(Op.getOperand(2))->getZExtValue();
+  bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0);
+
+  ARM_MB::MemBOpt DMBOpt;
+  if (isDeviceBarrier)
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY;
+  else
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH;
+  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(DMBOpt, MVT::i32));
 }
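// ---- Editor's aside: illustrative sketch, not part of this patch ----------
// The rewritten LowerMEMBARRIER above picks a DMB option from the
// llvm.memory.barrier operands: a device barrier targets the full-system
// domain, anything else the inner-shareable domain, and a store-only barrier
// selects the *ST variants.  A minimal standalone model of that decision
// (enumerator names here are stand-ins, not the real ARM_MB values):
#include <cstdio>

enum DmbOption { SY, ST, ISH, ISHST };

static DmbOption pickDmbOption(bool isDeviceBarrier, bool isOnlyStoreBarrier) {
  if (isDeviceBarrier)
    return isOnlyStoreBarrier ? ST : SY;     // dmb st    / dmb sy
  return isOnlyStoreBarrier ? ISHST : ISH;   // dmb ishst / dmb ish
}

int main() {
  // A store-store barrier that is not device-wide lowers to "dmb ishst".
  std::printf("%d\n", pickDmbOption(false, true) == ISHST);
  return 0;
}
// ----------------------------------------------------------------------------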
-static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
-                            unsigned VarArgsFrameIndex) {
-  // vastart just stores the address of the VarArgsFrameIndex slot into the
-  // memory location argument.
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  // ARM pre v5TE and Thumb1 does not have preload instructions.
+  if (!(Subtarget->isThumb2() ||
+        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
   DebugLoc dl = Op.getDebugLoc();
-  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-  SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
-  const Value *SV = cast(Op.getOperand(2))->getValue();
-  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
-                      false, false, 0);
+  unsigned isRead = ~cast(Op.getOperand(2))->getZExtValue() & 1;
+  if (!isRead &&
+      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
+    // ARMv7 with MP extension has PLDW.
+    return Op.getOperand(0);
+
+  if (Subtarget->isThumb())
+    // Invert the bits.
+    isRead = ~isRead & 1;
+  unsigned isData = Subtarget->isThumb() ? 0 : 1;
+
+  // Currently there is no intrinsic that matches pli.
+  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
+                     DAG.getConstant(isData, MVT::i32));
 }
 
-SDValue
-ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
-  SDNode *Node = Op.getNode();
-  DebugLoc dl = Node->getDebugLoc();
-  EVT VT = Node->getValueType(0);
-  SDValue Chain = Op.getOperand(0);
-  SDValue Size = Op.getOperand(1);
-  SDValue Align = Op.getOperand(2);
-
-  // Chain the dynamic stack allocation so that it doesn't modify the stack
-  // pointer when other instructions are using the stack.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
-
-  unsigned AlignVal = cast(Align)->getZExtValue();
-  unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment();
-  if (AlignVal > StackAlign)
-    // Do this now since selection pass cannot introduce new target
-    // independent node.
-    Align = DAG.getConstant(-(uint64_t)AlignVal, VT);
-
-  // In Thumb1 mode, there isn't a "sub r, sp, r" instruction, we will end up
-  // using a "add r, sp, r" instead. Negate the size now so we don't have to
-  // do even more horrible hack later.
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
   MachineFunction &MF = DAG.getMachineFunction();
-  ARMFunctionInfo *AFI = MF.getInfo();
-  if (AFI->isThumb1OnlyFunction()) {
-    bool Negate = true;
-    ConstantSDNode *C = dyn_cast(Size);
-    if (C) {
-      uint32_t Val = C->getZExtValue();
-      if (Val <= 508 && ((Val & 3) == 0))
-        Negate = false;
-    }
-    if (Negate)
-      Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size);
-  }
+  ARMFunctionInfo *FuncInfo = MF.getInfo();
-  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
-  SDValue Ops1[] = { Chain, Size, Align };
-  SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3);
-  Chain = Res.getValue(1);
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
-                             DAG.getIntPtrConstant(0, true), SDValue());
-  SDValue Ops2[] = { Res, Chain };
-  return DAG.getMergeValues(Ops2, 2, dl);
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  DebugLoc dl = Op.getDebugLoc();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+  const Value *SV = cast(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
 }
 
 SDValue
 ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &Root, SelectionDAG &DAG,
-                                        DebugLoc dl) {
+                                        DebugLoc dl) const {
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo();
@@ -1653,12 +2165,12 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
   SDValue ArgValue2;
   if (NextVA.isMemLoc()) {
     MachineFrameInfo *MFI = MF.getFrameInfo();
-    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true, false);
+    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
     // Create load node to retrieve arguments from the stack.
     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
     ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
-                            PseudoSourceValue::getFixedStack(FI), 0,
+                            MachinePointerInfo::getFixedStack(FI),
                             false, false, 0);
   } else {
     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
@@ -1674,7 +2186,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                         const SmallVectorImpl &Ins,
                                         DebugLoc dl, SelectionDAG &DAG,
-                                        SmallVectorImpl &InVals) {
+                                        SmallVectorImpl &InVals)
+                                          const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1708,11 +2221,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
         VA = ArgLocs[++i]; // skip ahead to next loc
         SDValue ArgValue2;
         if (VA.isMemLoc()) {
-          int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(),
-                                          true, false);
+          int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
           SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
           ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
-                                  PseudoSourceValue::getFixedStack(FI), 0,
+                                  MachinePointerInfo::getFixedStack(FI),
                                   false, false, 0);
         } else {
           ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
@@ -1753,7 +2265,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       default: llvm_unreachable("Unknown loc info!");
       case CCValAssign::Full: break;
       case CCValAssign::BCvt:
-        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
       case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
@@ -1776,13 +2288,12 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
       unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
-      int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(),
-                                      true, false);
+      int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), true);
       // Create load nodes to retrieve arguments from the stack.
       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
       InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
-                                   PseudoSourceValue::getFixedStack(FI), 0,
+                                   MachinePointerInfo::getFixedStack(FI),
                                    false, false, 0));
     }
   }
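// ---- Editor's aside: illustrative sketch, not part of this patch ----------
// GetF64FormalArgument and the f64 handling above rebuild a double that the
// calling convention split across two 32-bit locations (two GPRs, or a GPR
// plus a stack slot); in the DAG that glue ends up as an ARMISD::VMOVDRR
// node.  The same bit-level operation in plain C++, assuming the usual
// little-endian AAPCS word order (low word first):
#include <cstdint>
#include <cstring>

double f64FromHalves(uint32_t lo, uint32_t hi) {
  uint64_t bits = (static_cast<uint64_t>(hi) << 32) | lo;
  double d;
  std::memcpy(&d, &bits, sizeof d);  // a bit copy, not a value conversion
  return d;
}

// Example: f64FromHalves(0x00000000u, 0x3ff00000u) yields 1.0.
// ----------------------------------------------------------------------------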
@@ -1805,10 +2316,12 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
     // to their spots on the stack so that they may be loaded by deferencing
     // the result of va_next.
     AFI->setVarArgsRegSaveSize(VARegSaveSize);
-    VarArgsFrameIndex = MFI->CreateFixedObject(VARegSaveSize, ArgOffset +
-                                               VARegSaveSize - VARegSize,
-                                               true, false);
-    SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+    AFI->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(VARegSaveSize,
+                             ArgOffset + VARegSaveSize - VARegSize,
+                             false));
+    SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
+                                    getPointerTy());
     SmallVector MemOps;
     for (; NumGPRs < 4; ++NumGPRs) {
@@ -1820,9 +2333,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
       SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
-      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                       PseudoSourceValue::getFixedStack(VarArgsFrameIndex), 0,
-                       false, false, 0);
+      SDValue Store =
+        DAG.getStore(Val.getValue(1), dl, Val, FIN,
+             MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
+                     false, false, 0);
       MemOps.push_back(Store);
       FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
                         DAG.getConstant(4, getPointerTy()));
@@ -1832,7 +2346,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                           &MemOps[0], MemOps.size());
   } else
     // This will point to the next argument passed via stack.
-    VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset, true, false);
+    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
   }
   return Chain;
@@ -1858,7 +2372,8 @@ static bool isFloatingPointZero(SDValue Op) {
 /// the given operands.
 SDValue
 ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                             SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) {
+                             SDValue &ARMcc, SelectionDAG &DAG,
+                             DebugLoc dl) const {
   if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) {
     unsigned C = RHSC->getZExtValue();
     if (!isLegalICmpImmediate(C)) {
@@ -1867,28 +2382,28 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       default: break;
       case ISD::SETLT:
       case ISD::SETGE:
-        if (isLegalICmpImmediate(C-1)) {
+        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C-1, MVT::i32);
        }
        break;
       case ISD::SETULT:
       case ISD::SETUGE:
-        if (C > 0 && isLegalICmpImmediate(C-1)) {
+        if (C != 0 && isLegalICmpImmediate(C-1)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C-1, MVT::i32);
        }
        break;
       case ISD::SETLE:
       case ISD::SETGT:
-        if (isLegalICmpImmediate(C+1)) {
+        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C+1, MVT::i32);
        }
        break;
       case ISD::SETULE:
       case ISD::SETUGT:
-        if (C < 0xffffffff && isLegalICmpImmediate(C+1)) {
+        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C+1, MVT::i32);
        }
@@ -1909,13 +2424,14 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     CompareType = ARMISD::CMPZ;
     break;
   }
-  ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  ARMcc = DAG.getConstant(CondCode, MVT::i32);
   return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS);
 }
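// ---- Editor's aside: illustrative sketch, not part of this patch ----------
// getARMCmp above nudges a compare against a constant that is not encodable
// as an ARM immediate into an equivalent compare against C-1 or C+1 with a
// tweaked condition code (e.g. "x < 0x101" becomes "x <= 0x100").  The new
// C != 0x80000000 / 0 / 0x7fffffff / 0xffffffff guards keep that rewrite
// from wrapping at the ends of the integer range.  A standalone model of the
// ARM so_imm rule (an 8-bit value rotated right by an even amount), which is
// roughly what isLegalICmpImmediate() amounts to in ARM mode:
#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t v, unsigned n) {
  return (v << n) | (v >> ((32 - n) & 31));
}

static bool isARMSOImm(uint32_t v) {
  for (unsigned rot = 0; rot < 32; rot += 2)
    if (rotl32(v, rot) <= 0xffu)
      return true;
  return false;
}

int main() {
  // 0x101 needs nine significant bits, so it is not encodable, but 0x100 is;
  // an i32 "setlt x, 0x101" can therefore be emitted as "setle x, 0x100".
  assert(!isARMSOImm(0x101) && isARMSOImm(0x100));
  // The guard added by this patch: never rewrite setlt/setge against
  // INT32_MIN, since 0x80000000 - 1 wraps around and changes the comparison.
  assert(0x80000000u - 1 == 0x7fffffffu);
  return 0;
}
// ----------------------------------------------------------------------------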
 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
-static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
-                         DebugLoc dl) {
+SDValue
+ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+                             DebugLoc dl) const {
   SDValue Cmp;
   if (!isFloatingPointZero(RHS))
     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS);
@@ -1924,7 +2440,53 @@ static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp);
 }
 
-SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue SelectTrue = Op.getOperand(1);
+  SDValue SelectFalse = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Convert:
+  //
+  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
+  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
+  //
+  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
+    const ConstantSDNode *CMOVTrue =
+      dyn_cast(Cond.getOperand(0));
+    const ConstantSDNode *CMOVFalse =
+      dyn_cast(Cond.getOperand(1));
+
+    if (CMOVTrue && CMOVFalse) {
+      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
+      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
+
+      SDValue True;
+      SDValue False;
+      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
+        True = SelectTrue;
+        False = SelectFalse;
+      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
+        True = SelectFalse;
+        False = SelectTrue;
+      }
+
+      if (True.getNode() && False.getNode()) {
+        EVT VT = Cond.getValueType();
+        SDValue ARMcc = Cond.getOperand(2);
+        SDValue CCR = Cond.getOperand(3);
+        SDValue Cmp = Cond.getOperand(4);
+        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
+      }
+    }
+  }
+
+  return DAG.getSelectCC(dl, Cond,
+                         DAG.getConstant(0, Cond.getValueType()),
+                         SelectTrue, SelectFalse, ISD::SETNE);
+}
+
+SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -1934,65 +2496,190 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   if (LHS.getValueType() == MVT::i32) {
-    SDValue ARMCC;
+    SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl);
-    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp);
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
   }
   ARMCC::CondCodes CondCode, CondCode2;
   FPCCToARMCC(CC, CondCode, CondCode2);
-  SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
-  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
-                               ARMCC, CCR, Cmp);
+                               ARMcc, CCR, Cmp);
   if (CondCode2 != ARMCC::AL) {
-    SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32);
+    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); Result = DAG.getNode(ARMISD::CMOV, dl, VT, - Result, TrueVal, ARMCC2, CCR, Cmp2); + Result, TrueVal, ARMcc2, CCR, Cmp2); } return Result; } -SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) { - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); - DebugLoc dl = Op.getDebugLoc(); +/// canChangeToInt - Given the fp compare operand, return true if it is suitable +/// to morph to an integer compare sequence. +static bool canChangeToInt(SDValue Op, bool &SeenZero, + const ARMSubtarget *Subtarget) { + SDNode *N = Op.getNode(); + if (!N->hasOneUse()) + // Otherwise it requires moving the value from fp to integer registers. + return false; + if (!N->getNumValues()) + return false; + EVT VT = Op.getValueType(); + if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) + // f32 case is generally profitable. f64 case only makes sense when vcmpe + + // vmrs are very slow, e.g. cortex-a8. + return false; - if (LHS.getValueType() == MVT::i32) { - SDValue ARMCC; - SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl); - return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, - Chain, Dest, ARMCC, CCR,Cmp); + if (isFloatingPointZero(Op)) { + SeenZero = true; + return true; + } + return ISD::isNormalLoad(N); +} + +static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { + if (isFloatingPointZero(Op)) + return DAG.getConstant(0, MVT::i32); + + if (LoadSDNode *Ld = dyn_cast(Op)) + return DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, + SDValue &RetVal1, SDValue &RetVal2) { + if (isFloatingPointZero(Op)) { + RetVal1 = DAG.getConstant(0, MVT::i32); + RetVal2 = DAG.getConstant(0, MVT::i32); + return; + } + + if (LoadSDNode *Ld = dyn_cast(Op)) { + SDValue Ptr = Ld->getBasePtr(); + RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), Ptr, + Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + + EVT PtrType = Ptr.getValueType(); + unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); + SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(), + PtrType, Ptr, DAG.getConstant(4, PtrType)); + RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), NewPtr, + Ld->getPointerInfo().getWithOffset(4), + Ld->isVolatile(), Ld->isNonTemporal(), + NewAlign); + return; + } + + llvm_unreachable("Unknown VFP cmp argument!"); +} + +/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some +/// f32 and even f64 comparisons to integer ones. +SDValue +ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + DebugLoc dl = Op.getDebugLoc(); + + bool SeenZero = false; + if (canChangeToInt(LHS, SeenZero, Subtarget) && + canChangeToInt(RHS, SeenZero, Subtarget) && + // If one of the operand is zero, it's safe to ignore the NaN case since + // we only care about equality comparisons. 
+ (SeenZero || (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS)))) { + // If unsafe fp math optimization is enabled and there are no othter uses of + // the CMP operands, and the condition code is EQ oe NE, we can optimize it + // to an integer comparison. + if (CC == ISD::SETOEQ) + CC = ISD::SETEQ; + else if (CC == ISD::SETUNE) + CC = ISD::SETNE; + + SDValue ARMcc; + if (LHS.getValueType() == MVT::f32) { + LHS = bitcastf32Toi32(LHS, DAG); + RHS = bitcastf32Toi32(RHS, DAG); + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMcc, CCR, Cmp); + } + + SDValue LHS1, LHS2; + SDValue RHS1, RHS2; + expandf64Toi32(LHS, DAG, LHS1, LHS2); + expandf64Toi32(RHS, DAG, RHS1, RHS2); + ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + ARMcc = DAG.getConstant(CondCode, MVT::i32); + SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); + SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; + return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7); + } + + return SDValue(); +} + +SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + DebugLoc dl = Op.getDebugLoc(); + + if (LHS.getValueType() == MVT::i32) { + SDValue ARMcc; + SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, + Chain, Dest, ARMcc, CCR, Cmp); + } + + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + if (UnsafeFPMath && + (CC == ISD::SETEQ || CC == ISD::SETOEQ || + CC == ISD::SETNE || CC == ISD::SETUNE)) { + SDValue Result = OptimizeVFPBrcond(Op, DAG); + if (Result.getNode()) + return Result; } - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); + SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); - SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp }; + SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); if (CondCode2 != ARMCC::AL) { - ARMCC = DAG.getConstant(CondCode2, MVT::i32); - SDValue Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) }; + ARMcc = DAG.getConstant(CondCode2, MVT::i32); + SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); } return Res; } -SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); @@ -2016,14 +2703,14 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) { } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, - PseudoSourceValue::getJumpTable(), 0, + MachinePointerInfo::getJumpTable(), false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, 
Chain, Addr, JTI, UId); } else { Addr = DAG.getLoad(PTy, dl, Chain, Addr, - PseudoSourceValue::getJumpTable(), 0, false, false, 0); + MachinePointerInfo::getJumpTable(), false, false, 0); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId); } @@ -2044,7 +2731,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) { break; } Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0)); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); + return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op); } static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2063,11 +2750,11 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { break; } - Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0)); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0)); return DAG.getNode(Opc, dl, VT, Op); } -static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); @@ -2075,15 +2762,38 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0); - SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl); - SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32); + SDValue ARMcc = DAG.getConstant(ARMCC::LT, MVT::i32); + SDValue FP0 = DAG.getConstantFP(0.0, SrcVT); + SDValue Cmp = getVFPCmp(Tmp1, FP0, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp); + return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMcc, CCR, Cmp); } -SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(4, MVT::i32); + return DAG.getLoad(VT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, 0); + } + + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); +} + +SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); @@ -2091,197 +2801,69 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { ? 
ARM::R7 : ARM::R11; SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) - FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, 0); return FrameAddr; } -SDValue -ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - const Value *DstSV, uint64_t DstSVOff, - const Value *SrcSV, uint64_t SrcSVOff){ - // Do repeated 4-byte loads and stores. To be improved. - // This requires 4-byte alignment. - if ((Align & 3) != 0) - return SDValue(); - // This requires the copy size to be a constant, preferrably - // within a subtarget-specific limit. - ConstantSDNode *ConstantSize = dyn_cast(Size); - if (!ConstantSize) - return SDValue(); - uint64_t SizeVal = ConstantSize->getZExtValue(); - if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) - return SDValue(); - - unsigned BytesLeft = SizeVal & 3; - unsigned NumMemOps = SizeVal >> 2; - unsigned EmittedNumMemOps = 0; - EVT VT = MVT::i32; - unsigned VTSize = 4; - unsigned i = 0; - const unsigned MAX_LOADS_IN_LDM = 6; - SDValue TFOps[MAX_LOADS_IN_LDM]; - SDValue Loads[MAX_LOADS_IN_LDM]; - uint64_t SrcOff = 0, DstOff = 0; - - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. - while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, MVT::i32)), - SrcSV, SrcSVOff + SrcOff, isVolatile, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, MVT::i32)), - DstSV, DstSVOff + DstOff, isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - EmittedNumMemOps += i; - } - - if (BytesLeft == 0) - return Chain; - - // Issue loads / stores for the trailing (1 - 3) bytes. 
- unsigned BytesLeftSave = BytesLeft; - i = 0; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, MVT::i32)), - SrcSV, SrcSVOff + SrcOff, false, false, 0); - TFOps[i] = Loads[i].getValue(1); - ++i; - SrcOff += VTSize; - BytesLeft -= VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); - - i = 0; - BytesLeft = BytesLeftSave; - while (BytesLeft) { - if (BytesLeft >= 2) { - VT = MVT::i16; - VTSize = 2; - } else { - VT = MVT::i8; - VTSize = 1; - } - - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, MVT::i32)), - DstSV, DstSVOff + DstOff, false, false, 0); - ++i; - DstOff += VTSize; - BytesLeft -= VTSize; - } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); -} - -static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { +/// ExpandBITCAST - If the target supports VFP, this function is called to +/// expand a bit convert where either the source or destination type is i64 to +/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 +/// operand type is illegal (e.g., v2f32 for a target that doesn't support +/// vectors), since the legalizer won't know what to do with that. +static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + DebugLoc dl = N->getDebugLoc(); SDValue Op = N->getOperand(0); - // Do not create a VMOVDRR or VMOVRRD node if the operand type is not - // legal. The legalizer won't know what to do with that. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isTypeLegal(Op.getValueType())) - return SDValue(); + // This function is only supposed to be called for i64 types, either as the + // source or destination of the bit convert. + EVT SrcVT = Op.getValueType(); + EVT DstVT = N->getValueType(0); + assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && + "ExpandBITCAST called for non-i64 type"); - DebugLoc dl = N->getDebugLoc(); - if (N->getValueType(0) == MVT::f64) { - // Turn i64->f64 into VMOVDRR. + // Turn i64->f64 into VMOVDRR. + if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + return DAG.getNode(ISD::BITCAST, dl, DstVT, + DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } // Turn f64->i64 into VMOVRRD. - SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, - DAG.getVTList(MVT::i32, MVT::i32), &Op, 1); + if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { + SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), &Op, 1); + // Merge the pieces into a single i64 value. + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); + } - // Merge the pieces into a single i64 value. - return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); + return SDValue(); } /// getZeroVector - Returns a vector of specified type with all zero elements. -/// +/// Zero vectors are used to represent vector negation and in those cases +/// will be implemented with the NEON VNEG instruction. 
However, VNEG does +/// not support i64 elements, so sometimes the zero vectors will need to be +/// explicitly constructed. Regardless, use a canonical VMOV to create the +/// zero vector. static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - - // Zero vectors are used to represent vector negation and in those cases - // will be implemented with the NEON VNEG instruction. However, VNEG does - // not support i64 elements, so sometimes the zero vectors will need to be - // explicitly constructed. For those cases, and potentially other uses in - // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted - // to their dest type. This ensures they get CSE'd. - SDValue Vec; - SDValue Cst = DAG.getTargetConstant(0, MVT::i8); - SmallVector Ops; - MVT TVT; - - if (VT.getSizeInBits() == 64) { - Ops.assign(8, Cst); TVT = MVT::v8i8; - } else { - Ops.assign(16, Cst); TVT = MVT::v16i8; - } - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); - - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); -} - -/// getOnesVector - Returns a vector of specified type with all bits set. -/// -static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { - assert(VT.isVector() && "Expected a vector type"); - - // Always build ones vectors as <16 x i8> or <8 x i8> bitcasted to their - // dest type. This ensures they get CSE'd. - SDValue Vec; - SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8); - SmallVector Ops; - MVT TVT; - - if (VT.getSizeInBits() == 64) { - Ops.assign(8, Cst); TVT = MVT::v8i8; - } else { - Ops.assign(16, Cst); TVT = MVT::v16i8; - } - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size()); - - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); + // The canonical modified immediate encoding of a zero vector is....0! + SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32); + EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. -SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -2289,7 +2871,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) { SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMCC; + SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); @@ -2305,9 +2887,9 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) { SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, - ARMCC, DAG, dl); + ARMcc, DAG, dl); SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, + SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; @@ -2316,7 +2898,8 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) { /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. -SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -2324,7 +2907,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) { SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMCC; + SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, @@ -2338,15 +2921,33 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) { SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE, - ARMCC, DAG, dl); + ARMcc, DAG, dl); SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMCC, + SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, 2, dl); } +SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + // The rounding mode is in bits 23:22 of the FPSCR. + // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 + // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) + // so that the shift + and get folded into a bitfield extract. + DebugLoc dl = Op.getDebugLoc(); + SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, + DAG.getConstant(Intrinsic::arm_get_fpscr, + MVT::i32)); + SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, + DAG.getConstant(1U << 22, MVT::i32)); + SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, + DAG.getConstant(22, MVT::i32)); + return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, MVT::i32)); +} + static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); @@ -2364,33 +2965,40 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); DebugLoc dl = N->getDebugLoc(); + if (!VT.isVector()) + return SDValue(); + // Lower vector shifts on NEON to use VSHL. - if (VT.isVector()) { - assert(ST->hasNEON() && "unexpected vector shift"); - - // Left shifts translate directly to the vshiftu intrinsic. 
- if (N->getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), - N->getOperand(0), N->getOperand(1)); - - assert((N->getOpcode() == ISD::SRA || - N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); - - // NEON uses the same intrinsics for both left and right shifts. For - // right shifts, the shift amounts are negative, so negate the vector of - // shift amounts. - EVT ShiftVT = N->getOperand(1).getValueType(); - SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, - getZeroVector(ShiftVT, DAG, dl), - N->getOperand(1)); - Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? - Intrinsic::arm_neon_vshifts : - Intrinsic::arm_neon_vshiftu); + assert(ST->hasNEON() && "unexpected vector shift"); + + // Left shifts translate directly to the vshiftu intrinsic. + if (N->getOpcode() == ISD::SHL) return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(vshiftInt, MVT::i32), - N->getOperand(0), NegatedCount); - } + DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32), + N->getOperand(0), N->getOperand(1)); + + assert((N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + + // NEON uses the same intrinsics for both left and right shifts. For + // right shifts, the shift amounts are negative, so negate the vector of + // shift amounts. + EVT ShiftVT = N->getOperand(1).getValueType(); + SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, + getZeroVector(ShiftVT, DAG, dl), + N->getOperand(1)); + Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? + Intrinsic::arm_neon_vshifts : + Intrinsic::arm_neon_vshiftu); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(vshiftInt, MVT::i32), + N->getOperand(0), NegatedCount); +} + +static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); // We can get here for a node like i32 = ISD::SHL i32, i64 if (VT != MVT::i64) @@ -2409,9 +3017,9 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), - DAG.getConstant(0, MVT::i32)); + DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, MVT::i32)); // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. @@ -2502,13 +3110,13 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { AndOp = Op1; // Ignore bitconvert. 
- if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT) + if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { Opc = ARMISD::VTST; - Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0)); - Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1)); + Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0)); + Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1)); Invert = !Invert; } } @@ -2517,7 +3125,38 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { if (Swap) std::swap(Op0, Op1); - SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + // If one of the operands is a constant vector zero, attempt to fold the + // comparison to a specialized compare-against-zero form. + SDValue SingleOp; + if (ISD::isBuildVectorAllZeros(Op1.getNode())) + SingleOp = Op0; + else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { + if (Opc == ARMISD::VCGE) + Opc = ARMISD::VCLEZ; + else if (Opc == ARMISD::VCGT) + Opc = ARMISD::VCLTZ; + SingleOp = Op1; + } + + SDValue Result; + if (SingleOp.getNode()) { + switch (Opc) { + case ARMISD::VCEQ: + Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break; + case ARMISD::VCGE: + Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break; + case ARMISD::VCLEZ: + Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break; + case ARMISD::VCGT: + Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break; + case ARMISD::VCLTZ: + Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break; + default: + Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + } + } else { + Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + } if (Invert) Result = DAG.getNOT(dl, Result, VT); @@ -2525,95 +3164,152 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { return Result; } -/// isVMOVSplat - Check if the specified splat value corresponds to an immediate -/// VMOV instruction, and if so, return the constant being splatted. -static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef, - unsigned SplatBitSize, SelectionDAG &DAG) { +/// isNEONModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON instruction with a "modified immediate" +/// operand (e.g., VMOV). If so, return the encoded value. +static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + EVT &VT, bool is128Bits, NEONModImmType type) { + unsigned OpCmode, Imm; + + // SplatBitSize is set to the smallest size that splats the vector, so a + // zero vector will always have SplatBitSize == 8. However, NEON modified + // immediate instructions others than VMOV do not support the 8-bit encoding + // of a zero vector, and the default encoding of zero is supposed to be the + // 32-bit version. + if (SplatBits == 0) + SplatBitSize = 32; + switch (SplatBitSize) { case 8: - // Any 1-byte value is OK. + if (type != VMOVModImm) + return SDValue(); + // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); - return DAG.getTargetConstant(SplatBits, MVT::i8); + OpCmode = 0xe; + Imm = SplatBits; + VT = is128Bits ? MVT::v16i8 : MVT::v8i8; + break; case 16: // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. - if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i16); - break; + VT = is128Bits ? 
MVT::v8i16 : MVT::v4i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn: Op=x, Cmode=100x. + OpCmode = 0x8; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00: Op=x, Cmode=101x. + OpCmode = 0xa; + Imm = SplatBits >> 8; + break; + } + return SDValue(); case 32: // NEON's 32-bit VMOV supports splat values where: // * only one byte is nonzero, or // * the least significant byte is 0xff and the second byte is nonzero, or // * the least significant 2 bytes are 0xff and the third is nonzero. - if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0 || - (SplatBits & ~0xff0000) == 0 || - (SplatBits & ~0xff000000) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i32); + VT = is128Bits ? MVT::v4i32 : MVT::v2i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn: Op=x, Cmode=000x. + OpCmode = 0; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00: Op=x, Cmode=001x. + OpCmode = 0x2; + Imm = SplatBits >> 8; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000: Op=x, Cmode=010x. + OpCmode = 0x4; + Imm = SplatBits >> 16; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000: Op=x, Cmode=011x. + OpCmode = 0x6; + Imm = SplatBits >> 24; + break; + } + + // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC + if (type == OtherModImm) return SDValue(); if ((SplatBits & ~0xffff) == 0 && - ((SplatBits | SplatUndef) & 0xff) == 0xff) - return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32); + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff: Op=x, Cmode=1100. + OpCmode = 0xc; + Imm = SplatBits >> 8; + SplatBits |= 0xff; + break; + } if ((SplatBits & ~0xffffff) == 0 && - ((SplatBits | SplatUndef) & 0xffff) == 0xffff) - return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32); + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff: Op=x, Cmode=1101. + OpCmode = 0xd; + Imm = SplatBits >> 16; + SplatBits |= 0xffff; + break; + } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. - break; + return SDValue(); case 64: { + if (type != VMOVModImm) + return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { - if (((SplatBits | SplatUndef) & BitMask) == BitMask) + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; - else if ((SplatBits & BitMask) != 0) + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { return SDValue(); + } BitMask <<= 8; + ImmMask <<= 1; } - return DAG.getTargetConstant(Val, MVT::i64); - } - - default: - llvm_unreachable("unexpected size for isVMOVSplat"); + // Op=1, Cmode=1110. + OpCmode = 0x1e; + SplatBits = Val; + VT = is128Bits ? MVT::v2i64 : MVT::v1i64; break; } - return SDValue(); -} - -/// getVMOVImm - If this is a build_vector of constants which can be -/// formed by using a VMOV instruction of the specified element size, -/// return the constant being splatted. The ByteSize field indicates the -/// number of bytes of each element [1248]. 
-SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { - BuildVectorSDNode *BVN = dyn_cast(N); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ByteSize * 8)) - return SDValue(); - - if (SplatBitSize > ByteSize * 8) + default: + llvm_unreachable("unexpected size for isNEONModifiedImm"); return SDValue(); + } - return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG); + unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + return DAG.getTargetConstant(EncodedVal, MVT::i32); } static bool isVEXTMask(const SmallVectorImpl &M, EVT VT, bool &ReverseVEXT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); ReverseVEXT = false; + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first @@ -2629,6 +3325,7 @@ static bool isVEXTMask(const SmallVectorImpl &M, EVT VT, ReverseVEXT = true; } + if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } @@ -2654,13 +3351,16 @@ static bool isVREVMask(const SmallVectorImpl &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { - if ((unsigned) M[i] != - (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) + if (M[i] < 0) continue; // ignore UNDEF indices + if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) return false; } @@ -2676,8 +3376,8 @@ static bool isVTRNMask(const SmallVectorImpl &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { - if ((unsigned) M[i] != i + WhichResult || - (unsigned) M[i+1] != i + NumElts + WhichResult) + if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || + (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) return false; } return true; @@ -2695,8 +3395,8 @@ static bool isVTRN_v_undef_Mask(const SmallVectorImpl &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { - if ((unsigned) M[i] != i + WhichResult || - (unsigned) M[i+1] != i + WhichResult) + if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || + (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) return false; } return true; @@ -2711,6 +3411,7 @@ static bool isVUZPMask(const SmallVectorImpl &M, EVT VT, unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i != NumElts; ++i) { + if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned) M[i] != 2 * i + WhichResult) return false; } @@ -2736,7 +3437,8 @@ static bool isVUZP_v_undef_Mask(const SmallVectorImpl &M, EVT VT, for (unsigned j = 0; j != 2; ++j) { unsigned Idx = WhichResult; for (unsigned i = 0; i != Half; ++i) { - if ((unsigned) M[i + j * Half] != Idx) + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) return false; Idx += 2; } @@ -2759,8 +3461,8 @@ static bool isVZIPMask(const SmallVectorImpl &M, EVT VT, WhichResult = (M[0] == 0 ? 
0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { - if ((unsigned) M[i] != Idx || - (unsigned) M[i+1] != Idx + NumElts) + if ((M[i] >= 0 && (unsigned) M[i] != Idx) || + (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) return false; Idx += 1; } @@ -2785,8 +3487,8 @@ static bool isVZIP_v_undef_Mask(const SmallVectorImpl &M, EVT VT, WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { - if ((unsigned) M[i] != Idx || - (unsigned) M[i+1] != Idx) + if ((M[i] >= 0 && (unsigned) M[i] != Idx) || + (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) return false; Idx += 1; } @@ -2798,46 +3500,30 @@ static bool isVZIP_v_undef_Mask(const SmallVectorImpl &M, EVT VT, return true; } +// If N is an integer constant that can be moved into a register in one +// instruction, return an SDValue of such a constant (will become a MOV +// instruction). Otherwise return null. +static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, + const ARMSubtarget *ST, DebugLoc dl) { + uint64_t Val; + if (!isa(N)) + return SDValue(); + Val = cast(N)->getZExtValue(); -static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) { - // Canonicalize all-zeros and all-ones vectors. - ConstantSDNode *ConstVal = cast(Val.getNode()); - if (ConstVal->isNullValue()) - return getZeroVector(VT, DAG, dl); - if (ConstVal->isAllOnesValue()) - return getOnesVector(VT, DAG, dl); - - EVT CanonicalVT; - if (VT.is64BitVector()) { - switch (Val.getValueType().getSizeInBits()) { - case 8: CanonicalVT = MVT::v8i8; break; - case 16: CanonicalVT = MVT::v4i16; break; - case 32: CanonicalVT = MVT::v2i32; break; - case 64: CanonicalVT = MVT::v1i64; break; - default: llvm_unreachable("unexpected splat element type"); break; - } + if (ST->isThumb1Only()) { + if (Val <= 255 || ~Val <= 255) + return DAG.getConstant(Val, MVT::i32); } else { - assert(VT.is128BitVector() && "unknown splat vector size"); - switch (Val.getValueType().getSizeInBits()) { - case 8: CanonicalVT = MVT::v16i8; break; - case 16: CanonicalVT = MVT::v8i16; break; - case 32: CanonicalVT = MVT::v4i32; break; - case 64: CanonicalVT = MVT::v2i64; break; - default: llvm_unreachable("unexpected splat element type"); break; - } + if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) + return DAG.getConstant(Val, MVT::i32); } - - // Build a canonical splat for this value. - SmallVector Ops; - Ops.assign(CanonicalVT.getVectorNumElements(), Val); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0], - Ops.size()); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res); + return SDValue(); } // If this is a case we can't handle, return null and let the default // expansion code take care of it. -static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { BuildVectorSDNode *BVN = cast(Op.getNode()); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -2847,28 +3533,100 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { - SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, DAG); + // Check if an immediate VMOV works. 
+ EVT VmovVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VmovVT, VT.is128BitVector(), + VMOVModImm); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); + } + + // Try an immediate VMVN. + uint64_t NegatedImm = (SplatBits.getZExtValue() ^ + ((1LL << SplatBitSize) - 1)); + Val = isNEONModifiedImm(NegatedImm, + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VmovVT, VT.is128BitVector(), + VMVNModImm); + if (Val.getNode()) { + SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); + } + } + } + + // Scan through the operands to see if only one value is used. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool isConstant = true; + SDValue Value; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa(V) && !isa(V)) + isConstant = false; + + if (!Value.getNode()) + Value = V; + else if (V != Value) + usesOnlyOneValue = false; + } + + if (!Value.getNode()) + return DAG.getUNDEF(VT); + + if (isOnlyLowElement) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + // Use VDUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue && EltSize <= 32) { + if (!isConstant) + return DAG.getNode(ARMISD::VDUP, dl, VT, Value); + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts); + Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) - return BuildSplat(Val, VT, DAG, dl); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); } + SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); + if (Val.getNode()) + return DAG.getNode(ARMISD::VDUP, dl, VT, Val); } - // If there are only 2 elements in a 128-bit vector, insert them into an - // undef vector. This handles the common case for 128-bit vector argument - // passing, where the insertions should be translated to subreg accesses - // with no real instructions. - if (VT.is128BitVector() && Op.getNumOperands() == 2) { - SDValue Val = DAG.getUNDEF(VT); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - if (Op0.getOpcode() != ISD::UNDEF) - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op0, - DAG.getIntPtrConstant(0)); - if (Op1.getOpcode() != ISD::UNDEF) - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op1, - DAG.getIntPtrConstant(1)); - return Val; + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Vectors with 32- or 64-bit elements can be built by directly assigning + // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands + // will be legalized. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. 
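The VMVN branch above derives its candidate by complementing the splat value within its own bit width rather than within 64 bits, so a constant whose bitwise-NOT at the splat size is encodable can still be built with a single vmvn. A standalone restatement of that computation; the function name and the explicit width guard are illustrative only.

#include <cstdint>

// Sketch only: complement a splat constant within SplatBitSize bits, the
// value the VMVN path above hands to the modified-immediate check.
static uint64_t complementWithinWidth(uint64_t SplatBits,
                                      unsigned SplatBitSize) {
  uint64_t WidthMask =
      SplatBitSize >= 64 ? ~0ULL : ((1ULL << SplatBitSize) - 1);
  return SplatBits ^ WidthMask;
}
// e.g. an i16 splat of 0xfffe is not a VMOV immediate, but its 16-bit
// complement 0x0001 is, so the vector can be materialized as vmvn.i16 #0x1.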
+ EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + SmallVector Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); } return SDValue(); @@ -2904,7 +3662,9 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, bool ReverseVEXT; unsigned Imm, WhichResult; - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || @@ -3002,64 +3762,67 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // of the same time so that they get CSEd properly. SVN->getMask(ShuffleMask); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) Lane = 0; + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize <= 32) { + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) Lane = 0; - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { - return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); } - return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i32)); - } - bool ReverseVEXT; - unsigned Imm; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { - if (ReverseVEXT) - std::swap(V1, V2); - return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } - - if (isVREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARMISD::VREV64, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARMISD::VREV32, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - - // Check for Neon shuffles that modify both input vectors in place. - // If both results are used, i.e., if there are two shuffles with the same - // source operands and with masks corresponding to both results of one of - // these operations, DAG memoization will ensure that a single node is - // used for both shuffles. 
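IsSingleInstrConstant above asks whether the splatted scalar constant can be materialized in one instruction, in which case a VDUP from that register beats the constant-pool fallback; for ARM mode it defers to ARM_AM::getSOImmVal. A standalone sketch of those immediate tests, assuming the usual encoding of an ARM modified immediate as an 8-bit value rotated right by an even amount; names are illustrative.

#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned N) {
  N &= 31;
  return N ? ((V << N) | (V >> (32 - N))) : V;
}

// Sketch only: mirrors the Thumb1 test in IsSingleInstrConstant above, where
// the value or its bitwise complement must fit in the 8-bit MOV immediate.
static bool isThumb1OneInstrImm(uint32_t Val) {
  return Val <= 0xff || (~Val) <= 0xff;
}

// Sketch only: an ARM-mode "modified immediate" is an 8-bit value rotated
// right by an even amount (what ARM_AM::getSOImmVal tests); MVN extends the
// reach to complements of such values.
static bool isARMOneInstrImm(uint32_t Val) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2)
    if (rotl32(Val, Rot) <= 0xff || rotl32(~Val, Rot) <= 0xff)
      return true;
  return false;
}
// e.g. 0x0003fc00 (0xff rotated) needs one MOV, while 0x12345678 does not,
// so a splat of the latter falls through to the constant-pool path.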
- unsigned WhichResult; - if (isVTRNMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVUZPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVZIPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } - if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. + unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. - if (VT.getVectorNumElements() == 4 && - (VT.is128BitVector() || VT.is64BitVector())) { + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) @@ -3071,7 +3834,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // Compute the index in the perfect shuffle table. 
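For 4-element shuffles that none of the special-case masks above match, the lowering falls back to the generated PerfectShuffle table: each mask entry becomes a digit in 0..8 (8 standing for UNDEF), the four digits form a base-9 index (the PFTableIndex line continues in the next hunk), and a small cost field packed into the top bits of the table entry decides whether the table expansion is used. A standalone restatement of the index computation; the function name is illustrative.

// Sketch only: linearize a 4-element shuffle mask the way the PerfectShuffle
// lookup does.  Indices 0..7 address the two 4-element inputs; -1 (UNDEF)
// maps to the digit 8.
static unsigned perfectShuffleTableIndex(const int Mask[4]) {
  unsigned PFIndexes[4];
  for (unsigned i = 0; i != 4; ++i)
    PFIndexes[i] = Mask[i] < 0 ? 8u : static_cast<unsigned>(Mask[i]);
  return PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
         PFIndexes[2] * 9 + PFIndexes[3];
}
// e.g. the identity mask <0,1,2,3> maps to 0*729 + 1*81 + 2*9 + 3 = 102.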
unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); @@ -3079,18 +3841,45 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } + // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. + if (EltSize >= 32) { + // Do the expansion with floating-point types, since that is what the VFP + // registers are defined to use, and since i64 is not legal. + EVT EltVT = EVT::getFloatingPointVT(EltSize); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); + V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); + SmallVector Ops; + for (unsigned i = 0; i < NumElts; ++i) { + if (ShuffleMask[i] < 0) + Ops.push_back(DAG.getUNDEF(EltVT)); + else + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + MVT::i32))); + } + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + return SDValue(); } static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - DebugLoc dl = Op.getDebugLoc(); - SDValue Vec = Op.getOperand(0); + // EXTRACT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(1); - assert(VT == MVT::i32 && - Vec.getValueType().getVectorElementType().getSizeInBits() < 32 && - "unexpected type for custom-lowering vector extract"); - return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); + if (!isa(Lane)) + return SDValue(); + + SDValue Vec = Op.getOperand(0); + if (Op.getValueType() == MVT::i32 && + Vec.getValueType().getVectorElementType().getSizeInBits() < 32) { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); + } + + return Op; } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { @@ -3104,16 +3893,156 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDValue Op1 = Op.getOperand(1); if (Op0.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0), + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), DAG.getIntPtrConstant(0)); if (Op1.getOpcode() != ISD::UNDEF) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1), + DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), DAG.getIntPtrConstant(1)); - return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); +} + +/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each +/// element has been zero/sign-extended, depending on the isSigned parameter, +/// from an integer type half its size. +static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, + bool isSigned) { + // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. + EVT VT = N->getValueType(0); + if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { + SDNode *BVN = N->getOperand(0).getNode(); + if (BVN->getValueType(0) != MVT::v4i32 || + BVN->getOpcode() != ISD::BUILD_VECTOR) + return false; + unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 
1 : 0; + unsigned HiElt = 1 - LoElt; + ConstantSDNode *Lo0 = dyn_cast(BVN->getOperand(LoElt)); + ConstantSDNode *Hi0 = dyn_cast(BVN->getOperand(HiElt)); + ConstantSDNode *Lo1 = dyn_cast(BVN->getOperand(LoElt+2)); + ConstantSDNode *Hi1 = dyn_cast(BVN->getOperand(HiElt+2)); + if (!Lo0 || !Hi0 || !Lo1 || !Hi1) + return false; + if (isSigned) { + if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && + Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) + return true; + } else { + if (Hi0->isNullValue() && Hi1->isNullValue()) + return true; + } + return false; + } + + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ConstantSDNode *C = dyn_cast(Elt)) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (isSigned) { + int64_t SExtVal = C->getSExtValue(); + if ((SExtVal >> HalfSize) != (SExtVal >> EltSize)) + return false; + } else { + if ((C->getZExtValue() >> HalfSize) != 0) + return false; + } + continue; + } + return false; + } + + return true; } -SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { +/// isSignExtended - Check if a node is a vector value that is sign-extended +/// or a constant BUILD_VECTOR with sign-extended elements. +static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, true)) + return true; + return false; +} + +/// isZeroExtended - Check if a node is a vector value that is zero-extended +/// or a constant BUILD_VECTOR with zero-extended elements. +static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, false)) + return true; + return false; +} + +/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending +/// load, or BUILD_VECTOR with extended elements, return the unextended value. +static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) + return N->getOperand(0); + if (LoadSDNode *LD = dyn_cast(N)) + return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(), + LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); + // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will + // have been legalized as a BITCAST from v4i32. + if (N->getOpcode() == ISD::BITCAST) { + SDNode *BVN = N->getOperand(0).getNode(); + assert(BVN->getOpcode() == ISD::BUILD_VECTOR && + BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); + unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0; + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32, + BVN->getOperand(LowElt), BVN->getOperand(LowElt+2)); + } + // Construct a new BUILD_VECTOR with elements truncated to half the size. 
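isExtendedBUILD_VECTOR above decides, per constant element, whether the value really occupies only the low half of its element type, which is what lets the multiply lowering below use VMULL. A standalone restatement of the per-element test; the helper name is illustrative and the shifts are arranged to avoid shifting by the full element width.

#include <cstdint>

// Sketch only: a constant sign-fits (or zero-fits) in half of an EltBits wide
// element when shifting out the low half leaves nothing but sign bits (or
// nothing at all).
static bool fitsInHalfElement(int64_t SExtVal, uint64_t ZExtVal,
                              unsigned EltBits, bool isSigned) {
  unsigned HalfBits = EltBits / 2;
  if (isSigned)
    return (SExtVal >> HalfBits) == (SExtVal >> (EltBits - 1));
  return (ZExtVal >> HalfBits) == 0;
}
// e.g. every element of a v4i32 constant <-32768, 7, 100, -1> sign-fits in
// i16, so a multiply of two such vectors can become VMULL.s16.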
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); + EVT VT = N->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; + unsigned NumElts = VT.getVectorNumElements(); + MVT TruncVT = MVT::getIntegerVT(EltSize); + SmallVector Ops; + for (unsigned i = 0; i != NumElts; ++i) { + ConstantSDNode *C = cast(N->getOperand(i)); + const APInt &CInt = C->getAPIntValue(); + Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT)); + } + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), + MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); +} + +static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { + // Multiplications are only custom-lowered for 128-bit vectors so that + // VMULL can be detected. Otherwise v2i64 multiplications are not legal. + EVT VT = Op.getValueType(); + assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL"); + SDNode *N0 = Op.getOperand(0).getNode(); + SDNode *N1 = Op.getOperand(1).getNode(); + unsigned NewOpc = 0; + if (isSignExtended(N0, DAG) && isSignExtended(N1, DAG)) + NewOpc = ARMISD::VMULLs; + else if (isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG)) + NewOpc = ARMISD::VMULLu; + else if (VT == MVT::v2i64) + // Fall through to expand this. It is not legal. + return SDValue(); + else + // Other vector multiplications are legal. + return Op; + + // Legalize to a VMULL instruction. + DebugLoc DL = Op.getDebugLoc(); + SDValue Op0 = SkipExtension(N0, DAG); + SDValue Op1 = SkipExtension(N1, DAG); + + assert(Op0.getValueType().is64BitVector() && + Op1.getValueType().is64BitVector() && + "unexpected types for extended operands to VMULL"); + return DAG.getNode(NewOpc, DL, VT, Op0, Op1); +} + +SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); @@ -3122,23 +4051,27 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { return Subtarget->isTargetDarwin() ? 
LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); - case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); - case ISD::VASTART: return LowerVASTART(Op, DAG, VarArgsFrameIndex); + case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget); + case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); - case ISD::RETURNADDR: break; + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); - case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG); + case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); @@ -3147,10 +4080,12 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::VSETCC: return LowerVSETCC(Op, DAG); - case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::MUL: return LowerMUL(Op, DAG); } return SDValue(); } @@ -3159,18 +4094,18 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { /// type with new values built out of custom code. void ARMTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, - SelectionDAG &DAG) { + SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); break; - case ISD::BIT_CONVERT: - Res = ExpandBIT_CONVERT(N, DAG); + case ISD::BITCAST: + Res = ExpandBITCAST(N, DAG); break; case ISD::SRL: case ISD::SRA: - Res = LowerShift(N, DAG, Subtarget); + Res = Expand64BitShift(N, DAG, Subtarget); break; } if (Res.getNode()) @@ -3223,7 +4158,12 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... @@ -3261,7 +4201,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, // ... 
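The LowerMUL code a little earlier rewrites a v2i64 multiply of sign- (or zero-) extended lanes into VMULLs/VMULLu on the unextended 32-bit operands that SkipExtension recovers. The rewrite is valid because a widening 32x32->64 multiply gives the same result as first extending and then multiplying in 64 bits. A tiny standalone check of that identity (plain C++, not part of the patch).

#include <cassert>
#include <cstdint>

int main() {
  // One lane of a v2i64 multiply whose operands were sign-extended from i32.
  int64_t ExtA = static_cast<int64_t>(INT32_MIN + 7);
  int64_t ExtB = 42;
  int64_t WideProduct = ExtA * ExtB;

  // What VMULL.s32 computes for that lane: a widening multiply of the
  // original, unextended i32 values.
  int32_t A = static_cast<int32_t>(ExtA);
  int32_t B = static_cast<int32_t>(ExtB);
  int64_t VmullProduct = static_cast<int64_t>(A) * static_cast<int64_t>(B);

  assert(WideProduct == VmullProduct);
  return 0;
}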
BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. return BB; } @@ -3304,7 +4244,12 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, loopMBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = MF->getRegInfo(); unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass); @@ -3349,15 +4294,23 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, // ... BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. return BB; } +static +MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I != Succ) + return *I; + llvm_unreachable("Expecting a BB with two successors!"); +} + MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB, - DenseMap *EM) const { + MachineBasicBlock *BB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); DebugLoc dl = MI->getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); @@ -3435,25 +4388,21 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) - .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - // Also inform sdisel of the edge changes. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) { - EM->insert(std::make_pair(*I, sinkMBB)); - sinkMBB->addSuccessor(*I); - } - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -3466,83 +4415,52 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII->get(ARM::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, + TII->get(ARM::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 
+ MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } - case ARM::tANDsp: - case ARM::tADDspr_: - case ARM::tSUBspi_: - case ARM::t2SUBrSPi_: - case ARM::t2SUBrSPi12_: - case ARM::t2SUBrSPs_: { - MachineFunction *MF = BB->getParent(); - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - bool DstIsDead = MI->getOperand(0).isDead(); - bool SrcIsKill = MI->getOperand(1).isKill(); - - if (SrcReg != ARM::SP) { - // Copy the source to SP from virtual register. - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg); - unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) - ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP) - .addReg(SrcReg, getKillRegState(SrcIsKill)); + case ARM::BCCi64: + case ARM::BCCZi64: { + // Compare both parts that make up the double comparison separately for + // equality. + bool RHSisZero = MI->getOpcode() == ARM::BCCZi64; + + unsigned LHS1 = MI->getOperand(1).getReg(); + unsigned LHS2 = MI->getOperand(2).getReg(); + if (RHSisZero) { + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS1).addImm(0)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) + .addReg(LHS2).addImm(0) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); + } else { + unsigned RHS1 = MI->getOperand(3).getReg(); + unsigned RHS2 = MI->getOperand(4).getReg(); + AddDefaultPred(BuildMI(BB, dl, + TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS1).addReg(RHS1)); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) + .addReg(LHS2).addReg(RHS2) + .addImm(ARMCC::EQ).addReg(ARM::CPSR); } - unsigned OpOpc = 0; - bool NeedPred = false, NeedCC = false, NeedOp3 = false; - switch (MI->getOpcode()) { - default: - llvm_unreachable("Unexpected pseudo instruction!"); - case ARM::tANDsp: - OpOpc = ARM::tAND; - NeedPred = true; - break; - case ARM::tADDspr_: - OpOpc = ARM::tADDspr; - break; - case ARM::tSUBspi_: - OpOpc = ARM::tSUBspi; - break; - case ARM::t2SUBrSPi_: - OpOpc = ARM::t2SUBrSPi; - NeedPred = true; NeedCC = true; - break; - case ARM::t2SUBrSPi12_: - OpOpc = ARM::t2SUBrSPi12; - NeedPred = true; - break; - case ARM::t2SUBrSPs_: - OpOpc = ARM::t2SUBrSPs; - NeedPred = true; NeedCC = true; NeedOp3 = true; - break; - } - MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP); - if (OpOpc == ARM::tAND) - AddDefaultT1CC(MIB); - MIB.addReg(ARM::SP); - MIB.addOperand(MI->getOperand(2)); - if (NeedOp3) - MIB.addOperand(MI->getOperand(3)); - if (NeedPred) - AddDefaultPred(MIB); - if (NeedCC) - AddDefaultCC(MIB); - - // Copy the result from SP to virtual register. - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg); - unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) - ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc)) - .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(ARM::SP); - MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB(); + MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); + if (MI->getOperand(0).getImm() == ARMCC::NE) + std::swap(destMBB, exitMBB); + + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) + .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); + BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2B : ARM::B)) + .addMBB(exitMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. 
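The BCCi64/BCCZi64 expansion above tests 64-bit (in)equality with two 32-bit compares: the first CMP sets the flags for one half, and the second CMP is predicated on EQ so it only updates the flags when the first halves matched. A standalone model of that flag behaviour; the function is illustrative, not the emitted machine code.

#include <cassert>
#include <cstdint>

// Sketch only: 64-bit equality as the predicated 32-bit compare pair above
// evaluates it.
static bool eq64ViaHalves(uint64_t LHS, uint64_t RHS) {
  bool Z = (static_cast<uint32_t>(LHS) == static_cast<uint32_t>(RHS)); // CMP lo
  if (Z)                                   // second CMP is predicated on EQ
    Z = (static_cast<uint32_t>(LHS >> 32) == static_cast<uint32_t>(RHS >> 32));
  return Z;                                // branch on EQ (or NE when swapped)
}

int main() {
  assert(eq64ViaHalves(0x0000000100000002ULL, 0x0000000100000002ULL));
  assert(!eq64ViaHalves(0x0000000100000002ULL, 0x0000000200000002ULL));
  assert(!eq64ViaHalves(0x0000000100000002ULL, 0x0000000100000003ULL));
  return 0;
}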
return BB; } } @@ -3612,30 +4530,42 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, return SDValue(); } -/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. -static SDValue PerformADDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // added by evan in r37685 with no testcase. - SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); - +/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with +/// operands N0 and N1. This is a helper for PerformADDCombine that is +/// called with the default operands, and if that fails, with commuted +/// operands. +static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI) { // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) { SDValue Result = combineSelectAndUse(N, N0, N1, DCI); if (Result.getNode()) return Result; } - if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { - SDValue Result = combineSelectAndUse(N, N1, N0, DCI); - if (Result.getNode()) return Result; - } - return SDValue(); } +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI); + if (Result.getNode()) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI); +} + /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. +/// static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - // added by evan in r37685 with no testcase. - SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) { @@ -3646,23 +4576,426 @@ static SDValue PerformSUBCombine(SDNode *N, return SDValue(); } +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + + if (Subtarget->isThumb1Only()) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32) + return SDValue(); + + ConstantSDNode *C = dyn_cast(N->getOperand(1)); + if (!C) + return SDValue(); + + uint64_t MulAmt = C->getZExtValue(); + unsigned ShiftAmt = CountTrailingZeros_64(MulAmt); + ShiftAmt = ShiftAmt & (32 - 1); + SDValue V = N->getOperand(0); + DebugLoc DL = N->getDebugLoc(); + + SDValue Res; + MulAmt >>= ShiftAmt; + if (isPowerOf2_32(MulAmt - 1)) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, VT, + V, DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt-1), + MVT::i32))); + } else if (isPowerOf2_32(MulAmt + 1)) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, + V, DAG.getConstant(Log2_32(MulAmt+1), + MVT::i32)), + V); + } else + return SDValue(); + + if (ShiftAmt != 0) + Res = DAG.getNode(ISD::SHL, DL, VT, Res, + DAG.getConstant(ShiftAmt, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. 
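PerformMULCombine above strength-reduces a multiply by a constant of the form (2^N +/- 1) << Z into a shift plus an add or subtract, with the power-of-two factor Z peeled off as trailing zero bits first and reapplied at the end. A standalone restatement of the arithmetic; the helper name is illustrative.

#include <cassert>
#include <cstdint>

// Sketch only: x * ((2^N + 1) << Z) == ((x << N) + x) << Z and
//              x * ((2^N - 1) << Z) == ((x << N) - x) << Z.
static uint32_t mulByShiftAdd(uint32_t X, unsigned N, bool PlusOne,
                              unsigned TrailingZeros) {
  uint32_t Res = PlusOne ? (X << N) + X : (X << N) - X;
  return Res << TrailingZeros;
}

int main() {
  uint32_t X = 12345;
  assert(mulByShiftAdd(X, 3, true, 2) == X * 36u);  // 36 = (8 + 1) << 2
  assert(mulByShiftAdd(X, 4, false, 0) == X * 15u); // 15 = 16 - 1
  return 0;
}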
+ DCI.CombineTo(N, Res, false); + return SDValue(); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Attempt to use immediate-form VBIC + BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VbicVT; + SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VbicVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); + SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); + } + } + } + + return SDValue(); +} + +/// PerformORCombine - Target-specific dag combine xforms for ISD::OR +static SDValue PerformORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Attempt to use immediate-form VORR + BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN && Subtarget->hasNEON() && + BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + if (SplatBitSize <= 64) { + EVT VorrVT; + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), SplatBitSize, + DAG, VorrVT, VT.is128BitVector(), + OtherModImm); + if (Val.getNode()) { + SDValue Input = + DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); + SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); + return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); + } + } + } + + // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when + // reasonable. + + // BFI is only available on V6T2+ + if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) + return SDValue(); + + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + // 1) or (and A, mask), val => ARMbfi A, val, mask + // iff (val & mask) == val + // + // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) + // && CountPopulation_32(mask) == CountPopulation_32(~mask2) + // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) + // && CountPopulation_32(mask) == CountPopulation_32(~mask2) + // (i.e., copy a bitfield value into another bitfield of the same width) + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + if (VT != MVT::i32) + return SDValue(); + + + // The value and the mask need to be constants so we can verify this is + // actually a bitfield set. If the mask is 0xffff, we can do better + // via a movt instruction, so don't use BFI in that case. 
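PerformANDCombine above feeds the complement of the splatted AND mask into the modified-immediate check because an AND with constant C is the same operation as VBIC (bit clear) with ~C, and VBIC's immediate form reaches constants that a plain immediate AND mask cannot. A small standalone check of that equivalence with a concrete constant (plain C++, values illustrative).

#include <cassert>
#include <cstdint>

int main() {
  // Per lane, "and x, #0xffffff00" equals "bic x, #0x000000ff"; 0xffffff00 is
  // not a NEON modified immediate, but 0xff is, so the combine above can emit
  // vbic.i32 with an immediate instead of materializing the mask.
  uint32_t X = 0x12345678u;
  uint32_t C = 0xffffff00u;
  assert((X & C) == (X & ~0x000000ffu));
  return 0;
}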
+ ConstantSDNode *C = dyn_cast(N0.getOperand(1)); + if (!C) + return SDValue(); + unsigned Mask = C->getZExtValue(); + if (Mask == 0xffff) + return SDValue(); + SDValue Res; + // Case (1): or (and A, mask), val => ARMbfi A, val, mask + if ((C = dyn_cast(N1))) { + unsigned Val = C->getZExtValue(); + if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val) + return SDValue(); + Val >>= CountTrailingZeros_32(~Mask); + + Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), + DAG.getConstant(Val, MVT::i32), + DAG.getConstant(Mask, MVT::i32)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } else if (N1.getOpcode() == ISD::AND) { + // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask + C = dyn_cast(N1.getOperand(1)); + if (!C) + return SDValue(); + unsigned Mask2 = C->getZExtValue(); + + if (ARM::isBitFieldInvertedMask(Mask) && + ARM::isBitFieldInvertedMask(~Mask2) && + (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask == 0xffff || Mask == 0xffff0000)) + return SDValue(); + // 2a + unsigned lsb = CountTrailingZeros_32(Mask2); + Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), + DAG.getConstant(lsb, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res, + DAG.getConstant(Mask, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } else if (ARM::isBitFieldInvertedMask(~Mask) && + ARM::isBitFieldInvertedMask(Mask2) && + (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) { + // The pack halfword instruction works better for masks that fit it, + // so use that when it's available. + if (Subtarget->hasT2ExtractPack() && + (Mask2 == 0xffff || Mask2 == 0xffff0000)) + return SDValue(); + // 2b + unsigned lsb = CountTrailingZeros_32(Mask); + Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), + DAG.getConstant(lsb, MVT::i32)); + Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, + DAG.getConstant(Mask2, MVT::i32)); + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, Res, false); + } + } + + return SDValue(); +} + /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // fmrrd(fmdrr x, y) -> x,y + TargetLowering::DAGCombinerInfo &DCI) { + // vmovrrd(vmovdrr x, y) -> x,y SDValue InDouble = N->getOperand(0); if (InDouble.getOpcode() == ARMISD::VMOVDRR) return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); return SDValue(); } +/// PerformVMOVDRRCombine - Target-specific dag combine xforms for +/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. +static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { + // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() == ISD::BITCAST) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::BITCAST) + Op1 = Op1.getOperand(0); + if (Op0.getOpcode() == ARMISD::VMOVRRD && + Op0.getNode() == Op1.getNode() && + Op0.getResNo() == 0 && Op1.getResNo() == 1) + return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), + N->getValueType(0), Op0.getOperand(0)); + return SDValue(); +} + +/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for +/// ISD::BUILD_VECTOR. 
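Case (1) above turns "or (and A, Mask), Val" into a single BFI when Val only occupies the bits cleared by Mask. BFI wants the inserted field right-justified, which is why Val is shifted down by the number of trailing zeros of ~Mask, the bit position where the field starts. A standalone model of the resulting operation; helper names are illustrative.

#include <cassert>
#include <cstdint>

static unsigned countTrailingZeroBits(uint32_t V) {
  unsigned N = 0;
  while (V && !(V & 1u)) { V >>= 1; ++N; }
  return N;
}

// Sketch only: what "BFI A, (Val >> lsb), Mask" computes for case (1).
static uint32_t bfiSketch(uint32_t A, uint32_t Val, uint32_t Mask) {
  unsigned Lsb = countTrailingZeroBits(~Mask); // field starts here
  uint32_t Field = Val >> Lsb;                 // right-justified field value
  return (A & Mask) | (Field << Lsb);          // keep A outside, insert Field
}

int main() {
  uint32_t A = 0xAABBCCDDu, Mask = 0xFFFF00FFu, Val = 0x00004200u;
  assert((Val & ~Mask) == Val);                // the precondition checked above
  assert(bfiSketch(A, Val, Mask) == ((A & Mask) | Val));
  return 0;
}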
+static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG) { + // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): + // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value + // into a pair of GPRs, which is fine when the value is used as a scalar, + // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. + if (N->getNumOperands() == 2) + return PerformVMOVDRRCombine(N, DAG); + + return SDValue(); +} + +/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for +/// ISD::VECTOR_SHUFFLE. +static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { + // The LLVM shufflevector instruction does not require the shuffle mask + // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does + // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the + // operands do not match the mask length, they are extended by concatenating + // them with undef vectors. That is probably the right thing for other + // targets, but for NEON it is better to concatenate two double-register + // size vector operands into a single quad-register size vector. Do that + // transformation here: + // shuffle(concat(v1, undef), concat(v2, undef)) -> + // shuffle(concat(v1, v2), undef) + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() != ISD::CONCAT_VECTORS || + Op1.getOpcode() != ISD::CONCAT_VECTORS || + Op0.getNumOperands() != 2 || + Op1.getNumOperands() != 2) + return SDValue(); + SDValue Concat0Op1 = Op0.getOperand(1); + SDValue Concat1Op1 = Op1.getOperand(1); + if (Concat0Op1.getOpcode() != ISD::UNDEF || + Concat1Op1.getOpcode() != ISD::UNDEF) + return SDValue(); + // Skip the transformation if any of the types are illegal. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = N->getValueType(0); + if (!TLI.isTypeLegal(VT) || + !TLI.isTypeLegal(Concat0Op1.getValueType()) || + !TLI.isTypeLegal(Concat1Op1.getValueType())) + return SDValue(); + + SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, + Op0.getOperand(0), Op1.getOperand(0)); + // Translate the shuffle mask. + SmallVector NewMask; + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfElts = NumElts/2; + ShuffleVectorSDNode *SVN = cast(N); + for (unsigned n = 0; n < NumElts; ++n) { + int MaskElt = SVN->getMaskElt(n); + int NewElt = -1; + if (MaskElt < (int)HalfElts) + NewElt = MaskElt; + else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) + NewElt = HalfElts + MaskElt - NumElts; + NewMask.push_back(NewElt); + } + return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat, + DAG.getUNDEF(VT), NewMask.data()); +} + +/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a +/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic +/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and +/// return true. +static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + // vldN-dup instructions only support 64-bit vectors for N > 1. + if (!VT.is64BitVector()) + return false; + + // Check if the VDUPLANE operand is a vldN-dup intrinsic. 
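PerformVECTOR_SHUFFLECombine above folds shuffle(concat(v1, undef), concat(v2, undef)) into shuffle(concat(v1, v2), undef), which requires remapping the mask: lanes taken from v1 keep their index, lanes taken from v2 move into the upper half of the single concat, and anything that referred to the undef halves becomes UNDEF. A standalone restatement of that remapping; the function name and std::vector mask type are illustrative.

#include <vector>

// Sketch only: translate a mask over (v1|undef, v2|undef) into a mask over
// (v1|v2, undef), following the loop above.
static std::vector<int> remapConcatShuffleMask(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  int HalfElts = NumElts / 2;
  std::vector<int> NewMask;
  NewMask.reserve(Mask.size());
  for (int MaskElt : Mask) {
    int NewElt = -1;
    if (MaskElt >= 0 && MaskElt < HalfElts)
      NewElt = MaskElt;                          // low half of the first concat
    else if (MaskElt >= NumElts && MaskElt < NumElts + HalfElts)
      NewElt = HalfElts + MaskElt - NumElts;     // low half of the second concat
    NewMask.push_back(NewElt);
  }
  return NewMask;
}
// e.g. with 4 lanes, <0,1,4,5> (the defined halves of both inputs) becomes
// <0,1,2,3> over concat(v1, v2).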
+ SDNode *VLD = N->getOperand(0).getNode(); + if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + unsigned NumVecs = 0; + unsigned NewOpc = 0; + unsigned IntNo = cast(VLD->getOperand(1))->getZExtValue(); + if (IntNo == Intrinsic::arm_neon_vld2lane) { + NumVecs = 2; + NewOpc = ARMISD::VLD2DUP; + } else if (IntNo == Intrinsic::arm_neon_vld3lane) { + NumVecs = 3; + NewOpc = ARMISD::VLD3DUP; + } else if (IntNo == Intrinsic::arm_neon_vld4lane) { + NumVecs = 4; + NewOpc = ARMISD::VLD4DUP; + } else { + return false; + } + + // First check that all the vldN-lane uses are VDUPLANEs and that the lane + // numbers match the load. + unsigned VLDLaneNo = + cast(VLD->getOperand(NumVecs+3))->getZExtValue(); + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + // Ignore uses of the chain result. + if (UI.getUse().getResNo() == NumVecs) + continue; + SDNode *User = *UI; + if (User->getOpcode() != ARMISD::VDUPLANE || + VLDLaneNo != cast(User->getOperand(1))->getZExtValue()) + return false; + } + + // Create the vldN-dup node. + EVT Tys[5]; + unsigned n; + for (n = 0; n < NumVecs; ++n) + Tys[n] = VT; + Tys[n] = MVT::Other; + SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1); + SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; + MemIntrinsicSDNode *VLDMemInt = cast(VLD); + SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys, + Ops, 2, VLDMemInt->getMemoryVT(), + VLDMemInt->getMemOperand()); + + // Update the uses. + for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); + UI != UE; ++UI) { + unsigned ResNo = UI.getUse().getResNo(); + // Ignore uses of the chain result. + if (ResNo == NumVecs) + continue; + SDNode *User = *UI; + DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + } + + // Now the vldN-lane intrinsic is dead except for its chain result. + // Update uses of the chain. + std::vector VLDDupResults; + for (unsigned n = 0; n < NumVecs; ++n) + VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); + VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); + DCI.CombineTo(VLD, VLDDupResults); + + return true; +} + +/// PerformVDUPLANECombine - Target-specific dag combine xforms for +/// ARMISD::VDUPLANE. +static SDValue PerformVDUPLANECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op = N->getOperand(0); + + // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses + // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. + if (CombineVLDDUP(N, DCI)) + return SDValue(N, 0); + + // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is + // redundant. Ignore bit_converts for now; element sizes are checked below. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) + return SDValue(); + + // Make sure the VMOV element size is not bigger than the VDUPLANE elements. + unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits(); + // The canonical VMOV for a zero vector uses a 32-bit element size. 
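The VDUPLANE combine above also drops the VDUPLANE entirely when its source is already a VMOVIMM/VMVNIMM splat, because duplicating any lane of a vector whose lanes are all equal just reproduces the vector, provided the splat's element size is no wider than the VDUPLANE's (which the element-size check that follows enforces). A trivial standalone illustration of that argument (plain C++, not part of the patch).

#include <array>
#include <cassert>

int main() {
  // A VMOVIMM result: every lane holds the same immediate.
  std::array<int, 4> Splat = {42, 42, 42, 42};
  // VDUPLANE of lane 2 of that splat.
  std::array<int, 4> Dup;
  Dup.fill(Splat[2]);
  // The duplicate is the splat itself, so the VDUPLANE can be dropped.
  assert(Dup == Splat);
  return 0;
}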
+ unsigned Imm = cast(Op.getOperand(0))->getZExtValue(); + unsigned EltBits; + if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) + EltSize = 8; + EVT VT = N->getValueType(0); + if (EltSize > VT.getVectorElementType().getSizeInBits()) + return SDValue(); + + return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); +} + /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. - while (Op.getOpcode() == ISD::BIT_CONVERT) + while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; @@ -3787,7 +5120,8 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Narrowing shifts require an immediate right shift. if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; - llvm_unreachable("invalid shift count for narrowing vector shift intrinsic"); + llvm_unreachable("invalid shift count for narrowing vector shift " + "intrinsic"); default: llvm_unreachable("unhandled vector shift"); @@ -3875,7 +5209,8 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // Nothing to be done for scalar shifts. - if (! VT.isVector()) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); assert(ST->hasNEON() && "unexpected vector shift"); @@ -3921,7 +5256,8 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, if (VT == MVT::i32 && (EltVT == MVT::i8 || EltVT == MVT::i16) && - TLI.isTypeLegal(Vec.getValueType())) { + TLI.isTypeLegal(Vec.getValueType()) && + isa(Lane)) { unsigned Opc = 0; switch (N->getOpcode()) { @@ -3946,7 +5282,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { // If the target supports NEON, try to use vmax/vmin instructions for f32 - // selects like "x < y ? x : y". Unless the FiniteOnlyFPMath option is set, + // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set, // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is // a NaN; only do the transformation when it matches that behavior. @@ -4032,7 +5368,14 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::ADD: return PerformADDCombine(N, DCI); case ISD::SUB: return PerformSUBCombine(N, DCI); + case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); + case ISD::OR: return PerformORCombine(N, DCI, Subtarget); + case ISD::AND: return PerformANDCombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI); + case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); + case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI.DAG); + case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: @@ -4046,17 +5389,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, } bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { - if (!Subtarget->hasV6Ops()) - // Pre-v6 does not support unaligned mem access. 
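PerformIntrinsicCombine above folds NEON shift intrinsics with splatted constant amounts into the immediate-form shift nodes, gated by the isVShiftLImm/isVShiftRImm helpers. Their bodies are not part of the shown hunks, so the following is only a sketch of the assumed range checks: left shifts encode 0..EltBits-1 and right shifts encode 1..EltBits, and the narrowing/long variants adjust those bounds further.

#include <cstdint>

// Sketch only (assumed ranges): NEON immediate shifts encode a left-shift
// amount of 0..EltBits-1 and a right-shift amount of 1..EltBits, so a
// splatted shift count is only usable as an instruction immediate inside
// those ranges.
static bool isValidVShiftImm(int64_t Cnt, unsigned EltBits,
                             bool isRightShift) {
  if (isRightShift)
    return Cnt >= 1 && Cnt <= static_cast<int64_t>(EltBits);
  return Cnt >= 0 && Cnt < static_cast<int64_t>(EltBits);
}
// e.g. vshr.s32 accepts #1..#32, while vshl.i32 accepts #0..#31.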
+ if (!Subtarget->allowsUnalignedMem()) return false; - else { - // v6+ may or may not support unaligned mem access depending on the system - // configuration. - // FIXME: This is pretty conservative. Should we provide cmdline option to - // control the behaviour? - if (!Subtarget->isTargetDarwin()) - return false; - } switch (VT.getSimpleVT().SimpleTy) { default: @@ -4269,7 +5603,7 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal(Imm) != -1; if (Subtarget->isThumb2()) - return ARM_AM::getT2SOImmVal(Imm) != -1; + return ARM_AM::getT2SOImmVal(Imm) != -1; return Imm >= 0 && Imm <= 255; } @@ -4411,9 +5745,11 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); } else return false; @@ -4421,13 +5757,25 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, - isInc, DAG); + isInc, DAG); else isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) return false; + if (Ptr != Base) { + // Swap base ptr and offset to catch more post-index load / store when + // it's legal. In Thumb2 mode, offset must be an immediate. + if (Ptr == Offset && Op->getOpcode() == ISD::ADD && + !Subtarget->isThumb2()) + std::swap(Base, Offset); + + // Post-indexed load / store update the base pointer. + if (Ptr != Base) + return false; + } + AM = isInc ? ISD::POST_INC : ISD::POST_DEC; return true; } @@ -4474,6 +5822,40 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const { return TargetLowering::getConstraintType(Constraint); } +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +ARMTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + const Type *type = CallOperandVal->getType(); + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'l': + if (type->isIntegerTy()) { + if (Subtarget->isThumb()) + weight = CW_SpecificReg; + else + weight = CW_Register; + } + break; + case 'w': + if (type->isFloatingPointTy()) + weight = CW_Register; + break; + } + return weight; +} + std::pair ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const { @@ -4498,7 +5880,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(0U, ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } @@ -4548,7 +5930,6 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint, /// vector. 
If it is invalid, don't add anything to Ops. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Constraint, - bool hasMemory, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); @@ -4697,8 +6078,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, Ops.push_back(Result); return; } - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, - Ops, DAG); + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } bool @@ -4751,6 +6131,21 @@ int ARM::getVFPf64Imm(const APFloat &FPImm) { return ((int)Sign << 7) | (Exp << 4) | Mantissa; } +bool ARM::isBitFieldInvertedMask(unsigned v) { + if (v == 0xffffffff) + return 0; + // there can be 1's on either or both "outsides", all the "inside" + // bits must be 0's + unsigned int lsb = 0, msb = 31; + while (v & (1 << msb)) --msb; + while (v & (1 << lsb)) ++lsb; + for (unsigned int i = lsb; i <= msb; ++i) { + if (v & (1 << i)) + return 0; + } + return 1; +} + /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -4763,3 +6158,63 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return ARM::getVFPf64Imm(Imm) != -1; return false; } + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. +bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast(AlignArg)->getZExtValue(); + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + const Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast(AlignArg)->getZExtValue(); + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + default: + break; + } + + return false; +}
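ARM::isBitFieldInvertedMask above accepts exactly the masks BFI can use: all ones except for a single contiguous run of zeros (the field being written). An equivalent standalone formulation plus a couple of checks; the compact bit trick is an alternative way to state the condition, not a replacement for the loop in the patch.

#include <cassert>
#include <cstdint>

// Sketch only: a mask is "bitfield inverted" when its complement is a single
// contiguous run of ones, i.e. stripping the run's trailing zeros leaves a
// value of the form 2^k - 1.
static bool isBitFieldInvertedMaskSketch(uint32_t V) {
  if (V == 0xffffffffu)
    return false;                // no field at all
  uint32_t Field = ~V;           // the bits BFI would write
  while (Field && !(Field & 1u))
    Field >>= 1;
  return (Field & (Field + 1u)) == 0;
}

int main() {
  assert(isBitFieldInvertedMaskSketch(0xffff00ffu));  // 8-bit field at bit 8
  assert(isBitFieldInvertedMaskSketch(0x0000ffffu));  // field is the top half
  assert(!isBitFieldInvertedMaskSketch(0xff00ff00u)); // two separate runs
  return 0;
}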