static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
- cl::desc("Allow AArch64 SLI/SRI formation"),
- cl::init(false));
+ cl::desc("Allow AArch64 SLI/SRI formation"),
+ cl::init(false));
+// FIXME: The necessary dtprel relocations don't seem to be supported
+// well in the GNU bfd and gold linkers at the moment. Therefore, by
+// default, for now, fall back to GeneralDynamic code generation.
+cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
+ "aarch64-elf-ldtls-generation", cl::Hidden,
+ cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
+ cl::init(false));
-AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
- : TargetLowering(TM) {
- Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
+ const AArch64Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
}
// Compute derived properties from the register classes
- computeRegisterProperties();
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
- // f16 is storage-only, so we promote operations to f32 if we know this is
- // valid, and ignore them otherwise. The operations not mentioned here will
- // fail to select, but this is not a major problem as no source language
- // should be emitting native f16 operations yet.
- setOperationAction(ISD::FADD, MVT::f16, Promote);
- setOperationAction(ISD::FDIV, MVT::f16, Promote);
- setOperationAction(ISD::FMUL, MVT::f16, Promote);
- setOperationAction(ISD::FSUB, MVT::f16, Promote);
+ // f16 is a storage-only type, always promote it to f32.
+ setOperationAction(ISD::SETCC, MVT::f16, Promote);
+ setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT, MVT::f16, Promote);
+ setOperationAction(ISD::FADD, MVT::f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::FMA, MVT::f16, Promote);
+ setOperationAction(ISD::FNEG, MVT::f16, Promote);
+ setOperationAction(ISD::FABS, MVT::f16, Promote);
+ setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+ setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+ setOperationAction(ISD::FRINT, MVT::f16, Promote);
+ setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+ setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP, MVT::f16, Promote);
+ setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+ setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
// v4f16 is also a storage-only type, so promote it to v4f32 when that is
// known to be safe.
setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
// AArch64 has implementations of a lot of rounding-like FP operations.
- static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
- for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
- MVT Ty = RoundingTypes[I];
+ for (MVT Ty : {MVT::f32, MVT::f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
+ // Make floating-point constants legal for the large code model, so they don't
+ // become loads from the constant pool.
+ if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ }
+
// AArch64 does not have floating-point extending loads, i1 sign-extending
// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+ }
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
+
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
// Enable TBZ/TBNZ
MaskAndBranchFoldingIsLegal = true;
+ EnableExtLdPromotion = true;
setMinFunctionAlignment(2);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
- Expand);
-
- setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-
- setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
-
- for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
- setTruncStoreAction((MVT::SimpleValueType)VT,
- (MVT::SimpleValueType)InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
}
// AArch64 has implementations of a lot of rounding-like FP operations.
- static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
- for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
- MVT Ty = RoundingVecTypes[I];
+ for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
- setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+ for (MVT InnerVT : MVT::all_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
// CNT supports only B element sizes.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
return MVT::i64;
}
-unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
- // FIXME: On AArch64, this depends on the type.
- // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
- // and the offset has to be a multiple of the related size in bytes.
- return 4095;
-}
-
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
- case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL";
+ case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
case AArch64ISD::ADC: return "AArch64ISD::ADC";
case AArch64ISD::SBC: return "AArch64ISD::SBC";
case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
+ case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
+ case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
+ case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
+ case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
+ case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
+ case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
case AArch64ISD::NOT: return "AArch64ISD::NOT";
case AArch64ISD::BIT: return "AArch64ISD::BIT";
case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
// EndBB:
// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
- const TargetInstrInfo *TII =
- getTargetMachine().getSubtargetImpl()->getInstrInfo();
MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI->getDebugLoc();
MachineFunction::iterator It = MBB;
case ISD::SMULO:
case ISD::UMULO: {
CC = AArch64CC::NE;
- bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
+ bool IsSigned = Op.getOpcode() == ISD::SMULO;
if (Op.getValueType() == MVT::i32) {
unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// For a 32 bit multiply with overflow check we want the instruction
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
- SmallVector<SDValue, 2> Ops;
- for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
- Ops.push_back(Op.getOperand(i));
-
+ SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
SDLoc(Op)).first;
}
if (Op.getOperand(0).getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
+ // f16 conversions are promoted to f32.
+ if (Op.getOperand(0).getValueType() == MVT::f16) {
+ SDLoc dl(Op);
+ return DAG.getNode(
+ Op.getOpcode(), dl, Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
+ }
+
if (Op.getOperand(0).getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
else
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
- SmallVector<SDValue, 2> Ops;
- for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
- Ops.push_back(Op.getOperand(i));
-
+ SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
SDLoc(Op)).first;
}
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
+ // f16 conversions are promoted to f32.
+ if (Op.getValueType() == MVT::f16) {
+ SDLoc dl(Op);
+ return DAG.getNode(
+ ISD::FP_ROUND, dl, MVT::f16,
+ DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
+ DAG.getIntPtrConstant(0));
+ }
+
// i128 conversions are libcalls.
if (Op.getOperand(0).getValueType() == MVT::i128)
return SDValue();
llvm_unreachable("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
+ case CallingConv::GHC:
+ return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
if (!Subtarget->isTargetDarwin())
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
- std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
- CurArgIdx = Ins[i].OrigArgIndex;
-
- // Get type of the original argument.
- EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
- MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
- // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
- if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
- ValVT = MVT::i8;
- else if (ActualMVT == MVT::i16)
- ValVT = MVT::i16;
+ if (Ins[i].isOrigArg()) {
+ std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+ CurArgIdx = Ins[i].getOrigArgIndex();
+ // Get type of the original argument.
+ EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+ MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+ // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+ ValVT = MVT::i8;
+ else if (ActualMVT == MVT::i16)
+ ValVT = MVT::i16;
+ }
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
uint32_t BEAlign = 0;
- if (ArgSize < 8 && !Subtarget->isLittleEndian())
+ if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+ !Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7 };
static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
- unsigned FirstVariadicGPR =
- CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
+ unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
- unsigned FirstVariadicFPR =
- CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
+ unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
- if (GV->hasExternalWeakLinkage())
+ const Triple TT(getTargetMachine().getTargetTriple());
+ if (GV->hasExternalWeakLinkage() &&
+ (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
}
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
- if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+ if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+ !Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
- /*isVol = */ false,
- /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
+ /*isVol = */ false, /*AlwaysInline = */ false,
+ /*isTailCall = */ false,
+ DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
- const TargetRegisterInfo *TRI =
- getTargetMachine().getSubtargetImpl()->getRegisterInfo();
- const AArch64RegisterInfo *ARI =
- static_cast<const AArch64RegisterInfo *>(TRI);
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
if (IsThisReturn) {
// For 'this' returns, use the X0-preserving mask if applicable
- Mask = ARI->getThisReturnPreservedMask(CallConv);
+ Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
if (!Mask) {
IsThisReturn = false;
- Mask = ARI->getCallPreservedMask(CallConv);
+ Mask = TRI->getCallPreservedMask(MF, CallConv);
}
} else
- Mask = ARI->getCallPreservedMask(CallConv);
+ Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
- const TargetRegisterInfo *TRI =
- getTargetMachine().getSubtargetImpl()->getRegisterInfo();
- const AArch64RegisterInfo *ARI =
- static_cast<const AArch64RegisterInfo *>(TRI);
- const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+ const uint32_t *Mask =
+ Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
// Finally, we can make the call. This is just a degenerate version of a
// normal AArch64 call node: x0 takes the address of the descriptor, and
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
-/// is a function pointer to carry out the resolution. This function takes the
-/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
-/// other registers (except LR, NZCV) are preserved.
-///
-/// Thus, the ideal call sequence on AArch64 is:
-///
-/// adrp x0, :tlsdesc:thread_var
-/// ldr x8, [x0, :tlsdesc_lo12:thread_var]
-/// add x0, x0, :tlsdesc_lo12:thread_var
-/// .tlsdesccall thread_var
-/// blr x8
-/// (TPIDR_EL0 offset now in x0).
+/// is a function pointer to carry out the resolution.
///
-/// The ".tlsdesccall" directive instructs the assembler to insert a particular
-/// relocation to help the linker relax this sequence if it turns out to be too
-/// conservative.
+/// The sequence is:
+/// adrp x0, :tlsdesc:var
+/// ldr x1, [x0, #:tlsdesc_lo12:var]
+/// add x0, x0, #:tlsdesc_lo12:var
+/// .tlsdesccall var
+/// blr x1
+/// (TPIDR_EL0 offset now in x0)
///
-/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
-/// is harmless.
-SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
- SDValue DescAddr, SDLoc DL,
- SelectionDAG &DAG) const {
+/// The above sequence must be produced unscheduled, to enable the linker to
+/// optimize/relax this sequence.
+/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
+/// above sequence, and expanded really late in the compilation flow, to ensure
+/// the sequence is produced as per above.
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+ SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
- // The function we need to call is simply the first entry in the GOT for this
- // descriptor, load it in preparation.
- SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
+ SDValue Chain = DAG.getEntryNode();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- // TLS calls preserve all registers except those that absolutely must be
- // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
- // silly).
- const TargetRegisterInfo *TRI =
- getTargetMachine().getSubtargetImpl()->getRegisterInfo();
- const AArch64RegisterInfo *ARI =
- static_cast<const AArch64RegisterInfo *>(TRI);
- const uint32_t *Mask = ARI->getTLSCallPreservedMask();
-
- // The function takes only one argument: the address of the descriptor itself
- // in X0.
- SDValue Glue, Chain;
- Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
- Glue = Chain.getValue(1);
-
- // We're now ready to populate the argument list, as with a normal call:
- SmallVector<SDValue, 6> Ops;
+ SmallVector<SDValue, 2> Ops;
Ops.push_back(Chain);
- Ops.push_back(Func);
Ops.push_back(SymAddr);
- Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
- Ops.push_back(DAG.getRegisterMask(Mask));
- Ops.push_back(Glue);
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
- Glue = Chain.getValue(1);
+ Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
+ SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
"ELF TLS only supported in small memory model");
+ // Different choices can be made for the maximum size of the TLS area for a
+ // module. For the small address model, the default TLS size is 16MiB and the
+ // maximum TLS size is 4GiB.
+ // FIXME: add -mtls-size command line option and make it control the 16MiB
+ // vs. 4GiB code sequence generation.
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+ if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
+ if (Model == TLSModel::LocalDynamic)
+ Model = TLSModel::GeneralDynamic;
+ }
SDValue TPOff;
EVT PtrVT = getPointerTy();
if (Model == TLSModel::LocalExec) {
SDValue HiVar = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
- AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
- DAG.getTargetConstant(16, MVT::i32)),
- 0);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
- DAG.getTargetConstant(0, MVT::i32)),
- 0);
+ SDValue TPWithOff_lo =
+ SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
+ HiVar, DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ SDValue TPWithOff =
+ SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
+ LoVar, DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ return TPWithOff;
} else if (Model == TLSModel::InitialExec) {
TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
- // Accesses used in this sequence go via the TLS descriptor which lives in
- // the GOT. Prepare an address we can use to handle this.
- SDValue HiDesc = DAG.getTargetExternalSymbol(
- "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
- SDValue LoDesc = DAG.getTargetExternalSymbol(
- "_TLS_MODULE_BASE_", PtrVT,
- AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
- // First argument to the descriptor call is the address of the descriptor
- // itself.
- SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
- DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
// Now we can calculate the offset from TPIDR_EL0 to this module's
// thread-local area.
- TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+ TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
// Now use :dtprel_whatever: operations to calculate this variable's offset
// in its thread-storage area.
SDValue HiVar = DAG.getTargetGlobalAddress(
- GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+ GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0,
- AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
-
- SDValue DTPOff =
- SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
- DAG.getTargetConstant(16, MVT::i32)),
- 0);
- DTPOff =
- SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
- DAG.getTargetConstant(0, MVT::i32)),
- 0);
-
- TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
- } else if (Model == TLSModel::GeneralDynamic) {
- // Accesses used in this sequence go via the TLS descriptor which lives in
- // the GOT. Prepare an address we can use to handle this.
- SDValue HiDesc = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
- SDValue LoDesc = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
- // First argument to the descriptor call is the address of the descriptor
- // itself.
- SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
- DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
+ TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ } else if (Model == TLSModel::GeneralDynamic) {
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
// Finally we can make a call to calculate the offset from tpidr_el0.
- TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+ TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
} else
llvm_unreachable("Unsupported ELF TLS access model");
OFCC = getInvertedCondCode(OFCC);
SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
- return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
- CCVal, Overflow);
+ return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
}
if (LHS.getValueType().isInteger()) {
EVT VecVT;
EVT EltVT;
- SDValue EltMask, VecVal1, VecVal2;
+ uint64_t EltMask;
+ SDValue VecVal1, VecVal2;
if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
EltVT = MVT::i32;
VecVT = MVT::v4i32;
- EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+ EltMask = 0x80000000ULL;
if (!VT.isVector()) {
VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
// We want to materialize a mask with the the high bit set, but the AdvSIMD
// immediate moves cannot materialize that in a single instruction for
// 64-bit elements. Instead, materialize zero and then negate it.
- EltMask = DAG.getConstant(0, EltVT);
+ EltMask = 0;
if (!VT.isVector()) {
VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
llvm_unreachable("Invalid type for copysign!");
}
- std::vector<SDValue> BuildVectorOps;
- for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
- BuildVectorOps.push_back(EltMask);
-
- SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
+ SDValue BuildVec = DAG.getConstant(EltMask, VecVT);
// If we couldn't materialize the mask above, then the mask vector will be
// the zero vector, and we need to negate it here.
}
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
- if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::NoImplicitFloat))
return SDValue();
if (!Subtarget->hasNEON())
SDValue Val = Op.getOperand(0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
- SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
- SDValue VecVal;
- if (VT == MVT::i32) {
- VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
- VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
- VecVal);
- } else {
- VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
- }
+ if (VT == MVT::i32)
+ Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
- SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
+ SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue UaddLV = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
}
-SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue CC = Op->getOperand(0);
- SDValue TVal = Op->getOperand(1);
- SDValue FVal = Op->getOperand(2);
- SDLoc DL(Op);
-
- unsigned Opc = CC.getOpcode();
- // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
- // instruction.
- if (CC.getResNo() == 1 &&
- (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
- // Only lower legal XALUO ops.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
- return SDValue();
-
- AArch64CC::CondCode OFCC;
- SDValue Value, Overflow;
- std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
- SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
-
- return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
- CCVal, Overflow);
- }
-
- if (CC.getOpcode() == ISD::SETCC)
- return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
- cast<CondCodeSDNode>(CC.getOperand(2))->get());
- else
- return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
- FVal, ISD::SETNE);
-}
-
-SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
+ SDValue RHS, SDValue TVal,
+ SDValue FVal, SDLoc dl,
SelectionDAG &DAG) const {
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue TVal = Op.getOperand(2);
- SDValue FVal = Op.getOperand(3);
- SDLoc dl(Op);
-
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
if (LHS.getValueType() == MVT::f128) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
- EVT VT = Op.getValueType();
+ EVT VT = TVal.getValueType();
return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
}
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
assert(LHS.getValueType() == RHS.getValueType());
- EVT VT = Op.getValueType();
+ EVT VT = TVal.getValueType();
// Try to match this select into a max/min operation, which have dedicated
// opcode in the instruction set.
return CS1;
}
+SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+ SelectionDAG &DAG) const {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue TVal = Op.getOperand(2);
+ SDValue FVal = Op.getOperand(3);
+ SDLoc DL(Op);
+ return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+}
+
+SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue CCVal = Op->getOperand(0);
+ SDValue TVal = Op->getOperand(1);
+ SDValue FVal = Op->getOperand(2);
+ SDLoc DL(Op);
+
+ unsigned Opc = CCVal.getOpcode();
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
+ // instruction.
+ if (CCVal.getResNo() == 1 &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+ // Only lower legal XALUO ops.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
+ return SDValue();
+
+ AArch64CC::CondCode OFCC;
+ SDValue Value, Overflow;
+ std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
+ SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+ return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
+ CCVal, Overflow);
+ }
+
+ // Lower it the same way as we would lower a SELECT_CC node.
+ ISD::CondCode CC;
+ SDValue LHS, RHS;
+ if (CCVal.getOpcode() == ISD::SETCC) {
+ LHS = CCVal.getOperand(0);
+ RHS = CCVal.getOperand(1);
+ CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
+ } else {
+ LHS = CCVal;
+ RHS = DAG.getConstant(0, CCVal.getValueType());
+ CC = ISD::SETNE;
+ }
+ return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+}
+
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries as PC relative offsets. No additional tweaking
return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
- 8, false, false, MachinePointerInfo(DestSV),
+ 8, false, false, false, MachinePointerInfo(DestSV),
MachinePointerInfo(SrcSV));
}
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
- const std::string &Constraint, MVT VT) const {
+ const TargetRegisterInfo *TRI, const std::string &Constraint,
+ MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass *> Res;
- Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// The extraction can just take the second half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
- DAG.getIntPtrConstant(NumSrcElts));
+ DAG.getConstant(NumSrcElts, MVT::i64));
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
- Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
- Src.ShuffleVec, DAG.getIntPtrConstant(0));
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, MVT::i64));
} else {
// An actual VEXT is needed
- SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
- Src.ShuffleVec, DAG.getIntPtrConstant(0));
+ SDValue VEXTSrc1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, MVT::i64));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
- DAG.getIntPtrConstant(NumSrcElts));
+ DAG.getConstant(NumSrcElts, MVT::i64));
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
- else if (EltTy == MVT::i16)
+ else if (EltTy == MVT::i16 || EltTy == MVT::f16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
- MVT NewType =
- (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ EVT EltTy = VT.getVectorElementType();
+ assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
+ "Unsupported floating-point vector type");
+ MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
AArch64CC::CondCode CC, bool NoNans, EVT VT,
SDLoc dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
+ assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+ "function only supposed to emit natural comparisons");
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
APInt CnstBits(VT.getSizeInBits(), 0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
+ EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
SDLoc dl(Op);
if (LHS.getValueType().getVectorElementType().isInteger()) {
assert(LHS.getValueType() == RHS.getValueType());
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
- return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
- dl, DAG);
+ SDValue Cmp =
+ EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
+ return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
SDValue Cmp =
- EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+ EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
return SDValue();
if (CC2 != AArch64CC::AL) {
SDValue Cmp2 =
- EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+ EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
if (!Cmp2.getNode())
return SDValue();
- Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
+ Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
}
+ Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
+
if (ShouldInvert)
return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
return NumBits1 > NumBits2;
}
+/// Check if it is profitable to hoist instruction in then/else to if.
+/// Not profitable if I and it's user can form a FMA instruction
+/// because we prefer FMSUB/FMADD.
+bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
+ if (I->getOpcode() != Instruction::FMul)
+ return true;
+
+ if (I->getNumUses() != 1)
+ return true;
+
+ Instruction *User = I->user_back();
+
+ if (User &&
+ !(User->getOpcode() == Instruction::FSub ||
+ User->getOpcode() == Instruction::FAdd))
+ return true;
+
+ const TargetOptions &Options = getTargetMachine().Options;
+ EVT VT = getValueType(User->getOperand(0)->getType());
+
+ if (isFMAFasterThanFMulAndFAdd(VT) &&
+ isOperationLegalOrCustom(ISD::FMA, VT) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath))
+ return false;
+
+ return true;
+}
+
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
VT1.getSizeInBits() <= 32);
}
+bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
+ if (isa<FPExtInst>(Ext))
+ return false;
+
+ // Vector types are next free.
+ if (Ext->getType()->isVectorTy())
+ return false;
+
+ for (const Use &U : Ext->uses()) {
+ // The extension is free if we can fold it with a left shift in an
+ // addressing mode or an arithmetic operation: add, sub, and cmp.
+
+ // Is there a shift?
+ const Instruction *Instr = cast<Instruction>(U.getUser());
+
+ // Is this a constant shift?
+ switch (Instr->getOpcode()) {
+ case Instruction::Shl:
+ if (!isa<ConstantInt>(Instr->getOperand(1)))
+ return false;
+ break;
+ case Instruction::GetElementPtr: {
+ gep_type_iterator GTI = gep_type_begin(Instr);
+ std::advance(GTI, U.getOperandNo());
+ Type *IdxTy = *GTI;
+ // This extension will end up with a shift because of the scaling factor.
+ // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
+ // Get the shift amount based on the scaling factor:
+ // log2(sizeof(IdxTy)) - log2(8).
+ uint64_t ShiftAmt =
+ countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3;
+ // Is the constant foldable in the shift of the addressing mode?
+ // I.e., shift amount is between 1 and 4 inclusive.
+ if (ShiftAmt == 0 || ShiftAmt > 4)
+ return false;
+ break;
+ }
+ case Instruction::Trunc:
+ // Check if this is a noop.
+ // trunc(sext ty1 to ty2) to ty1.
+ if (Instr->getType() == Ext->getOperand(0)->getType())
+ continue;
+ // FALL THROUGH.
+ default:
+ return false;
+ }
+
+ // At this point we can use the bfm family, so this extension is free
+ // for that use.
+ }
+ return true;
+}
+
bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
unsigned &RequiredAligment) const {
if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
bool Fast;
const Function *F = MF.getFunction();
if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
- !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat) &&
+ !F->hasFnAttribute(Attribute::NoImplicitFloat) &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
(allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
return MVT::f128;
- return Size >= 8 ? MVT::i64 : MVT::i32;
+ if (Size >= 8 &&
+ (memOpAlign(SrcAlign, DstAlign, 8) ||
+ (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
+ return MVT::i64;
+
+ if (Size >= 4 &&
+ (memOpAlign(SrcAlign, DstAlign, 4) ||
+ (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
+ return MVT::i32;
+
+ return MVT::Other;
}
// 12-bit optionally shifted immediates are legal for adds.
unsigned LZ = countLeadingZeros((uint64_t)Val);
unsigned Shift = (63 - LZ) / 16;
// MOVZ is free so return true for one or fewer MOVK.
- return (Shift < 3) ? true : false;
+ return Shift < 3;
}
// Generate SUBS and CSEL for integer abs.
N->getOperand(0));
}
} else {
+ // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+ APInt VNP1 = -Value + 1;
+ if (VNP1.isPowerOf2()) {
+ SDValue ShiftedVal =
+ DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+ DAG.getConstant(VNP1.logBase2(), MVT::i64));
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
+ ShiftedVal);
+ }
// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
APInt VNM1 = -Value - 1;
if (VNM1.isPowerOf2()) {
DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
}
- // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
- APInt VNP1 = -Value + 1;
- if (VNP1.isPowerOf2()) {
- SDValue ShiftedVal =
- DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
- DAG.getConstant(VNP1.logBase2(), MVT::i64));
- return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
- ShiftedVal);
- }
}
}
return SDValue();
return SDValue();
}
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
// conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
// This eliminates an "integer-to-vector-move UOP and improve throughput.
SDValue N0 = N->getOperand(0);
- if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+
+ // Optimize concat_vectors of truncated vectors, where the intermediate
+ // type is illegal, to avoid said illegality, e.g.,
+ // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
+ // (v2i16 (truncate (v2i64)))))
+ // ->
+ // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
+ // (v4i32 (bitcast (v2i64))),
+ // <0, 2, 4, 6>)))
+ // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
+ // on both input and result type, so we might generate worse code.
+ // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
+ if (N->getNumOperands() == 2 &&
+ N0->getOpcode() == ISD::TRUNCATE &&
+ N1->getOpcode() == ISD::TRUNCATE) {
+ SDValue N00 = N0->getOperand(0);
+ SDValue N10 = N1->getOperand(0);
+ EVT N00VT = N00.getValueType();
+
+ if (N00VT == N10.getValueType() &&
+ (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
+ N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
+ MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
+ SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
+ for (size_t i = 0; i < Mask.size(); ++i)
+ Mask[i] = i * 2;
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getVectorShuffle(
+ MidVT, dl,
+ DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
+ DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
+ }
+ }
+
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDLoc dl(N);
- EVT VT = N->getValueType(0);
-
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
- if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
+ if (N0 == N1 && VT.getVectorNumElements() == 2) {
assert(VT.getVectorElementType().getSizeInBits() == 64);
- return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
- WidenVector(N->getOperand(0), DAG),
+ return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, MVT::i64));
}
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
- SDValue Op1 = N->getOperand(1);
- if (Op1->getOpcode() != ISD::BITCAST)
+ if (N1->getOpcode() != ISD::BITCAST)
return SDValue();
- SDValue RHS = Op1->getOperand(0);
+ SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
// If the RHS is not a vector, this is not the pattern we're looking for.
if (!RHSTy.isVector())
MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
RHSTy.getVectorNumElements() * 2);
- return DAG.getNode(
- ISD::BITCAST, dl, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
- DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
+ DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
+ RHS));
}
static SDValue tryCombineFixedPointConvert(SDNode *N,
N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}
+static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
+ SelectionDAG &DAG) {
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0),
+ DAG.getNode(Opc, SDLoc(N),
+ N->getOperand(1).getSimpleValueType(),
+ N->getOperand(1)),
+ DAG.getConstant(0, MVT::i64));
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
break;
+ case Intrinsic::aarch64_neon_saddv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
+ case Intrinsic::aarch64_neon_uaddv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
+ case Intrinsic::aarch64_neon_sminv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
+ case Intrinsic::aarch64_neon_uminv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
+ case Intrinsic::aarch64_neon_smaxv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
+ case Intrinsic::aarch64_neon_umaxv:
+ return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
case Intrinsic::aarch64_neon_fmax:
return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
LoVT.getVectorNumElements());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getIntPtrConstant(0));
+ DAG.getConstant(0, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
return SDValue();
// Cyclone has bad performance on unaligned 16B stores when crossing line and
- // page boundries. We want to split such stores.
+ // page boundaries. We want to split such stores.
if (!Subtarget->isCyclone())
return SDValue();
// Don't split at Oz.
MachineFunction &MF = DAG.getMachineFunction();
- bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::MinSize);
+ bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize);
if (IsMinSize)
return SDValue();
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
- DAG.getIntPtrConstant(0));
+ DAG.getConstant(0, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
- DAG.getIntPtrConstant(NumElts));
+ DAG.getConstant(NumElts, MVT::i64));
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
+ // Finally, check that the vector doesn't depend on the load.
+ // Again, this would create a cycle.
+ // The load depending on the vector is fine, as that's the case for the
+ // LD1*post we'll eventually generate anyway.
+ if (LoadSDN->isPredecessorOf(Vector.getNode()))
+ continue;
+
SmallVector<SDValue, 8> Ops;
Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
LoadSDN->getMemOperand());
// Update the uses.
- std::vector<SDValue> NewResults;
+ SmallVector<SDValue, 2> NewResults;
NewResults.push_back(SDValue(LD, 0)); // The result of load
NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
DCI.CombineTo(LD, NewResults);
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
- return performIntToFpCombine(N, DAG);
+ return performIntToFpCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
-bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+TargetLoweringBase::AtomicRMWExpansionKind
+AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
- return Size <= 128;
+ return Size <= 128 ? AtomicRMWExpansionKind::LLSC
+ : AtomicRMWExpansionKind::None;
}
bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {